COD_20230330_HOST_1_QC_analysis

Minsik Kim

2023-03-30

Loading packages

#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
#   knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
knitr::opts_chunk$set(message=FALSE, warning = FALSE)

path_working <- "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git/"
path_library <- "/Library/Frameworks/R.framework/Resources/library"
str_libraries <- c(
    "readxl", "phyloseq", "tidyverse", "pacman", "yaml"
)

path_working <- "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git"
path_library <- "/Library/Frameworks/R.framework/Resources/library"
str_libraries <- c("readxl", "phyloseq", "tidyverse", "pacman", "yaml", "ggplot2", "vegan", "microbiome", "ggpubr", "viridis", "decontam", "gridExtra", "ggpubr", "lme4", "lmerTest", "writexl", "harrietr", "Maaslin2", "ggtext", "ggpmisc", "gridExtra", "gamm4", "reshape2", "kableExtra", "knitr", "ggtree", "car")
        
YAML_header <-
'---
title: "Host-DNA depletion 1: data wrangling"
author: "Minsik Kim"
date: "2032.03.10"
output:
    rmdformats::downcute:
        downcute_theme: "chaos"
        code_folding: hide
        fig_width: 6
        fig_height: 6
---'
seed <- "20230330"

#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Boot <- function() {
    .libPaths(path_library)

    require(pacman)
    pacman::p_load(c("knitr", "rmarkdown", "rmdformats", "yaml"))

    knitr::opts_knit$set(root.dir = path_working)

    str_libraries |> unique() |> sort() -> str_libraries
    pacman::p_load(char = str_libraries)

    set.seed(seed)
}
FUN.LineZero.Boot()
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Report <- function() {
    cat("Line Zero Environment:\n\n")
    paste("R:", pacman::p_version(), "\n") |> cat()
    cat("Libraries:\n")
    for (str_libraries in str_libraries) {
        paste(
            "    ", str_libraries, ": ", pacman::p_version(package = str_libraries),
            "\n", sep = ""
        ) |> cat()
    }
    paste("\nOperating System:", pacman::p_detectOS(), "\n") |> cat()
    paste("    Library Path:", path_library, "\n") |> cat()
    paste("    Working Path:", path_working, "\n") |> cat()
    paste("Seed:", seed, "\n\n") |> cat()
    cat("YAML Header:\n")
    cat(YAML_header)
}
FUN.LineZero.Report()
## Line Zero Environment:
## 
## R: 4.2.2 
## Libraries:
##     readxl: 1.4.2
##     phyloseq: 1.40.0
##     tidyverse: 2.0.0
##     pacman: 0.5.1
##     yaml: 2.3.7
##     ggplot2: 3.4.1
##     vegan: 2.6.4
##     microbiome: 1.18.0
##     ggpubr: 0.6.0
##     viridis: 0.6.2
##     decontam: 1.16.0
##     gridExtra: 2.3
##     ggpubr: 0.6.0
##     lme4: 1.1.31
##     lmerTest: 3.1.3
##     writexl: 1.4.2
##     harrietr: 0.2.3
##     Maaslin2: 1.10.0
##     ggtext: 0.1.2
##     ggpmisc: 0.5.2
##     gridExtra: 2.3
##     gamm4: 0.2.6
##     reshape2: 1.4.4
##     kableExtra: 1.3.4
##     knitr: 1.42
##     ggtree: 3.4.4
##     car: 3.1.1
## 
## Operating System: Darwin 
##     Library Path: /Library/Frameworks/R.framework/Resources/library 
##     Working Path: /Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git 
## Seed: 20230330 
## 
## YAML Header:
## ---
## title: "Host-DNA depletion 1: data wrangling"
## author: "Minsik Kim"
## date: "2032.03.10"
## output:
##     rmdformats::downcute:
##         downcute_theme: "chaos"
##         code_folding: hide
##         fig_width: 6
##         fig_height: 6
## ---

Script description

1. Loading data

1.1. phyloseq obejct

1.2. qPCR data (controls)

2. QC

QC1. How many samples failed sequencing

QC2. How were changes in read stats and host DNA proportion?

QC3. How were the extraction controls

QC4. Prevalence / abundance filtering - red flag

3. Analysis

A0. Calculation of alpha-diversity indices

A1. Host DNA, bacterial DNA and % host

A2. Modeling of sequencing results

A3. Taxa alpha diversity

A4. Taxa beta diversity

Intermediate results

A5. DA analysis for taxa

A6. Decontam

A7. LM of function alpha diversity (BPI)

A8. permanova of function alpha diversity

A9. DA for function

Data inputs

Meta data

  • qPCR - bacteria

  • qPCR - human

  • qPCR host %

  • Raw reads

  • final reads

  • sequencing host %

  • library prep failure status

  • Raw reads

  • subject_id

  • treatment

  • sample_type

  • subject_id

Sequencing result

  • samples

  • controls

Loading data

# Loading files -----------------------------------------------------------
#loading tidy phyloseq object
phyloseq <- read_rds("/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/4_Data/2_Tidy/Phyloseq/PHY_20221129_MGK_host_tidy_tax.rds")

        ####Need to be removed after adding sequenicing data
                data_qPCR <- read_csv("/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/7_Manuscripts/2022_MGK_Host_Depletion/Tables/DAT_20230122_MGK_host_control_qPCR.csv")
                #qPCR data of all the samples sent out sequencing
                data_qPCR <- subset(data_qPCR, data_qPCR$baylor_other_id %in% c(sample_names(phyloseq$phyloseq_count)) | data_qPCR$baylor_other_id %in% c(read_excel("/Users/minsikkim/Dropbox (Partners HealthCare)/Project_Baylor/3_Documentation/Communications/2023-01-24_baylor_shipping_sicas2_nasal_host_depleted/CMMR_MetadataCapture_20230124_LaiP-PQ00430_SICAS2_NS.xlsx", skip = 27) %>% data.frame %>% .$`Optional..............secondary.ID`))
                
                data_qPCR <- subset(data_qPCR, data_qPCR$sample_type %in% c("BAL", "nasal_swab", "Sputum", "neg_depletion", "pos_depletion"))
                data_qPCR$sample_type <- factor(data_qPCR$sample_type, levels = c("BAL", "nasal_swab", "Sputum", "pos_depletion", "neg_depletion"),
                                                labels = c("BAL", "Nasal swab", "Sputum", "P depletion", "N depletion"))
                data_qPCR$treatment <- factor(data_qPCR$treatment, levels = c("control", "lyPMA", "benzonase", "host_zero", "molysis", "qiaamp"),
                                                labels = c("Control", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"))

#sample data loading
sample_data <- sample_data(phyloseq$phyloseq_count)

Q1. How were sequencing results?

Histogram of reads counts and host %

Figure - regular scale

Raw scale is not normally distributed

# Initail QC --------------------------------------------------------------
        #Quesetions - QC
        
#Q0. How many samples failed in sequencing

## figures -----raw data

sample_data %>% 
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        mutate(host_seq_percent = sequencing_host_prop * 100, 
               .after = sequencing_host_prop,) %>% 
        gather(feature, value, Raw_reads:host_seq_percent) %>%
        group_by(feature, sample_type) %>% 
        subset(., .$feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent")) %>%
        mutate(feature = factor(feature, levels = c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent"), labels = c("Raw reads", "Host mapped", "Final reads", "Host %"))) %>%
        ggplot(aes(x = value, fill = treatment)) +
                geom_histogram(bins = 97) +
                guides(fill=guide_legend(title="Treatment", nrow = 1)) +
                facet_grid(sample_type~feature, scales = "free") +
                ggtitle("log10 transfromed histrogram") +
                theme_classic() +
                theme(legend.position = "top") 

Figure - log10 scale

log transform is adquate for read counts

Host% is not transfromed well

## figures -----log10
sample_data %>% 
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        mutate(host_seq_percent = 100 * sequencing_host_prop, 
               .after = sequencing_host_prop,) %>% 
        gather(feature, value, Raw_reads:host_seq_percent) %>%
        group_by(feature, sample_type) %>% 
        subset(., .$feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent")) %>%
        mutate(feature = factor(feature, levels = c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent"), labels = c("Raw reads", "Host mapped", "Final reads", "Host %"))) %>%
        ggplot(aes(x = log10(value), fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_grid(sample_type~feature, scales = "free") +
                ggtitle("log10 transformed") +
                guides(fill=guide_legend(title="Treatment", nrow = 1)) +
                theme_classic() +
                theme(legend.position = "top")

Figure - scaling host proportion

Raw % will be used for host%

## figures -----log10
sample_data %>% 
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        mutate(host_seq_percent = 100 * sequencing_host_prop, 
               log_seq_percent = log10(host_seq_percent), 
               sqrt_seq_percent = sqrt(host_seq_percent), 
               .after = sequencing_host_prop,) %>% 
        gather(feature, value, Raw_reads:sqrt_seq_percent) %>%
        group_by(feature, sample_type) %>% 
        subset(., .$feature %in% c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent")) %>%
        mutate(feature = factor(feature, levels = c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent"), labels = c("Host %", "log10 (host %)", "Sqrt(host %)")))  %>% 
        ggplot(aes(x = value, fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_grid(sample_type~feature, scales = "free") +
                ggtitle("Host % transfromed (raw, log10, and sqrt) histrogram") +
                guides(fill=guide_legend(title="Treatment", nrow = 1)) +
                theme_classic() +
                theme(legend.position = "top")

Figure - log10 scale by treatment

ggarrange(ggplot(sample_data %>% subset(., !is.na(.$subject_id)) %>% data.frame(), aes(x = Final_reads, fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_wrap(~sample_type) +
                theme_classic(base_family = "serif") +
                ggtitle("Histogram of final reads by sample type and treatment") +
                scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment"),
             ggplot(sample_data %>% subset(., !is.na(.$subject_id)) %>% data.frame(), aes(x = log10(Final_reads), fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_wrap(~sample_type) +
                theme_classic(base_family = "serif") +
                ggtitle("Histogram of log10(final reads) by sample type and treatment") +
                scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment"),
          common.legend = T, ncol = 1)

Histogram (sum of OTU table)

2 samples showed 0 reads in sum(OTU)

hist((log10((phyloseq$phyloseq_count %>% otu_table %>% colSums()) + 1)),100, main = "Histogram of total reads (sum of OTU table)") # 2 samples showed 0 total reads (sum of otu_table)

Final reads of library prep failed samples

Some samples did not pass library prep QC, but showed reasonable final reads

#how were the samples failed in library prep?
sample_data %>% data.frame %>% mutate(total_read = phyloseq$phyloseq_count %>% otu_table %>% colSums()) %>% rownames_to_column(var = "baylor_other_id") %>%
        ggplot(aes(x = reorder(baylor_other_id, -total_read),
                               y = log10(total_read + 1),
                               col = lib_failed)) +
                geom_point() +
                theme_classic(base_family = "serif") +
                theme(axis.title.y = element_markdown(), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 4)) +
                ylab("log<sub>10</sub>(Sum of OTU table reads)") +
                xlab("Sample ID") +
        guides(col=guide_legend(title="Library failed")) +
        ggtitle("Sum of OTU reads by library failure status")

List of samples failed in sequencing

2 BAL samples (control and lyPMA group) failed in sequencing

sample_data %>% data.frame %>% filter(phyloseq$phyloseq_count %>% otu_table %>% colSums() == 0) # two BAL sampels showed 0 total reads
#sample_data(phyloseq$phyloseq_count) %>% data.frame() %>% subset(., .$lib_failed)

QC 1 Results:

1.1 Modeling final read should be conducted with log transfrom. Host % need no transformation.

1.2 13 samples failed in library prep

1.3. Two BAL sampels showed 0 total reads

1.4. Sequencing fail ≠ library prep fail

Comments from Baylor:

Q: What was the lab’s criteria for deciding which samples failed library prep.? There were 13 samples that you pointed as failed but their sequencing result actually looks pretty good (ie similar to samples that didn’t fail library prep)

A: To determine whether a library attempt “passed or failed” the lab looks at the picogreen concentrations and a library fragment size distribution trace. The trace files are an output from either the Fragment Analyzer or TapeStation (a copy of the trace files for PQ00331 is attached). If a sample has a background level pico concentration and no discernable fragment concentration on the trace (i.e. a flat trace line) it is considered failed library. If the sample is below the level of detection of our standard library QC methods, it is considered failure. It’s still possible that there is some small amounts of library in those samples that were successfully sequenced, but often those samples do not generate a meaningful amount of sequence data.

QC2 Chagnes of reads and host % by treatment

For detailed analysis, sequencing matrices were analyzed by each sample type and treatment

Reads and host % by treatment

QC table by treated (binary)

Changes in matrices were observed

#sequencing result by sample type and control (1/0)
options(dplyr.summarise.inform = FALSE)

sample_data %>% data.frame() %>% 
        group_by(sample_type, treated) %>% 
        summarise(N = n(),
                  `Raw reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Raw_reads/10000000),2),nsmall = 2, big.mark = ","), " [", format(round(quantile(Raw_reads/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Raw_reads/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
                  `Host reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Host_mapped/10000000),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(Host_mapped/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Host_mapped/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
                  `Host reads proportion<br>(median [IQR])<br>[%]` = paste(format(round(median(sequencing_host_prop * 100),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(sequencing_host_prop * 100, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(sequencing_host_prop * 100, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
                  `Final reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Final_reads/10000000),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(Final_reads/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Final_reads/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
        ) %>%
        rename(`Sample type` = sample_type, Treated = treated) %>%
        data.frame(check.names = F) %>% mutate_all(linebreak) %>% kbl(format = "html", escape = F) %>% kable_styling(full_width = 0, html_font = "serif")
Sample type Treated N Raw reads
(median [IQR])
[reads x 107]
Host reads
(median [IQR])
[reads x 107]
Host reads proportion
(median [IQR])
[%]
Final reads
(median [IQR])
[reads x 107]
BAL 0 5 15.73 [6.35, 15.92] 12.92 [5.21, 12.94] 82.14 [82.12, 82.18] 0.03 [0.03, 0.04]
BAL 1 25 6.17 [4.57, 17.43] 4.65 [2.78, 12.80] 75.59 [69.38, 78.44] 0.17 [0.10, 0.37]
Nasal swab 0 10 13.09 [7.73, 16.93] 10.05 [6.11, 13.04] 77.34 [76.89, 79.45] 0.48 [0.10, 0.87]
Nasal swab 1 25 4.08 [0.99, 6.40] 0.81 [0.26, 1.36] 27.46 [13.51, 64.58] 0.97 [0.17, 3.42]
Sputum 0 5 8.59 [8.25, 9.27] 6.87 [6.69, 7.50] 80.61 [79.92, 80.89] 0.06 [0.06, 0.09]
Sputum 1 25 12.23 [10.34, 13.73] 7.71 [3.76, 8.82] 58.64 [39.71, 74.93] 1.16 [0.47, 4.19]
Neg. ext. 0 1 0.02 [0.02, 0.02] 0.00 [0.00, 0.00] 3.13 [3.13, 3.13] 0.02 [0.02, 0.02]
Pos. ext. 0 1 11.87 [11.87, 11.87] 0.00 [0.00, 0.00] 0.02 [0.02, 0.02] 9.96 [9.96, 9.96]

QC table by treatment methods

Changes were sample type * treatment specific

sample_data %>% data.frame() %>% 
        #dplyr::filter(sample_type %in% c("Sputum", "nasal_swab", "BAL")) %>% 
        group_by (sample_type, treatment) %>%
        summarise(N = n(),
              `Raw reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Raw_reads/10000000),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(Raw_reads/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Raw_reads/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
              `Host reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Host_mapped/10000000),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(Host_mapped/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Host_mapped/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
              `Host reads proportion<br>(median [IQR])<br>[%]` = paste(format(round(median(sequencing_host_prop * 100),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(sequencing_host_prop * 100, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(sequencing_host_prop * 100, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
              `Final reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = paste(format(round(median(Final_reads/10000000),2), nsmall = 2, big.mark = ","), " [", format(round(quantile(Final_reads/10000000, 0.25),2), nsmall = 2, big.mark = ","), ", ", format(round(quantile(Final_reads/10000000, 0.75),2), nsmall = 2, big.mark = ","), "]", sep = ""),
        ) %>% data.frame(check.names = F) %>% 
        arrange(sample_type, treatment) %>%
        rename(`Sample type` = sample_type, Treatment = treatment) %>%
        mutate_all(linebreak) %>% kbl(format = "html", escape = F) %>% kable_styling(full_width = 0, html_font = "serif")
Sample type Treatment N Raw reads
(median [IQR])
[reads x 107]
Host reads
(median [IQR])
[reads x 107]
Host reads proportion
(median [IQR])
[%]
Final reads
(median [IQR])
[reads x 107]
BAL Control 5 15.73 [6.35, 15.92] 12.92 [5.21, 12.94] 82.14 [82.12, 82.18] 0.03 [0.03, 0.04]
BAL lyPMA 5 5.72 [3.59, 13.41] 4.65 [2.79, 10.90] 80.73 [77.85, 81.18] 0.06 [0.04, 0.10]
BAL Benzonase 5 18.59 [16.20, 23.63] 14.77 [12.80, 18.16] 79.02 [77.89, 79.45] 0.17 [0.16, 0.22]
BAL Host zero 5 4.57 [2.32, 4.71] 2.69 [1.61, 2.93] 66.98 [57.17, 68.95] 0.24 [0.13, 0.82]
BAL Molysis 5 4.76 [3.57, 4.86] 2.78 [1.39, 3.61] 74.39 [74.29, 75.59] 0.29 [0.13, 1.56]
BAL QIAamp 5 17.19 [15.35, 17.43] 11.87 [10.79, 12.22] 74.88 [71.08, 77.31] 0.26 [0.10, 1.02]
Nasal swab Control 10 13.09 [7.73, 16.93] 10.05 [6.11, 13.04] 77.34 [76.89, 79.45] 0.48 [0.10, 0.87]
Nasal swab lyPMA 5 0.98 [0.85, 1.24] 0.63 [0.28, 0.88] 71.37 [28.73, 74.68] 0.07 [0.06, 0.08]
Nasal swab Benzonase 5 5.75 [4.95, 6.57] 3.66 [1.29, 5.05] 64.58 [63.71, 76.83] 0.28 [0.26, 1.04]
Nasal swab Host zero 5 2.83 [1.42, 6.42] 0.49 [0.03, 0.81] 7.67 [2.33, 25.73] 2.43 [0.97, 5.03]
Nasal swab Molysis 5 0.99 [0.63, 4.08] 0.42 [0.06, 0.64] 41.04 [4.31, 64.24] 0.32 [0.17, 2.53]
Nasal swab QIAamp 5 6.40 [6.40, 6.80] 0.86 [0.86, 1.17] 17.24 [13.51, 19.57] 4.63 [4.50, 4.67]
Sputum Control 5 8.59 [8.25, 9.27] 6.87 [6.69, 7.50] 80.61 [79.92, 80.89] 0.06 [0.06, 0.09]
Sputum lyPMA 5 10.98 [5.22, 12.78] 8.82 [3.76, 10.44] 76.33 [72.02, 80.29] 0.25 [0.15, 0.44]
Sputum Benzonase 5 10.76 [10.34, 10.82] 7.81 [7.75, 8.24] 74.93 [72.13, 75.33] 0.47 [0.45, 0.59]
Sputum Host zero 5 13.14 [7.64, 13.95] 4.39 [3.80, 7.71] 49.71 [31.45, 55.49] 2.91 [2.36, 3.67]
Sputum Molysis 5 12.59 [10.84, 13.73] 2.98 [1.83, 4.28] 27.49 [14.62, 28.19] 6.11 [5.56, 8.37]
Sputum QIAamp 5 12.35 [12.23, 12.85] 9.08 [8.41, 9.27] 71.76 [56.69, 73.50] 1.16 [1.13, 3.89]
Neg. ext. Control 1 0.02 [0.02, 0.02] 0.00 [0.00, 0.00] 3.13 [3.13, 3.13] 0.02 [0.02, 0.02]
Pos. ext. Control 1 11.87 [11.87, 11.87] 0.00 [0.00, 0.00] 0.02 [0.02, 0.02] 9.96 [9.96, 9.96]

Figure of reads by treatment (z-score)

Changes were sample type * treatment specific

# Summary figures - facet and z-score -------------------------------------

sample_data %>% 
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        gather(feature, value, Raw_reads:sequencing_host_prop) %>%
        group_by(feature, sample_type) %>% 
        subset(., .$feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")) %>%
        mutate(z_score = scale(value),
               feature = factor(feature, levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"), labels = c("Raw reads", "Host mapped", "Final reads", "Host %"))) %>%
        ggplot(aes(x = treatment, y = z_score, fill = treatment)) +
                geom_boxplot(lwd = 0.2) +
                guides(fill=guide_legend(title="Treatment", nrow = 1)) +
                facet_grid(sample_type~feature) +
                xlab("Treatment") +
                ylab("Z score") +
                theme_classic(base_family = "serif", base_size = 14) +
                guides( x =  guide_axis(angle = 90)) + 
                theme(legend.position = "top") +
                scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment") #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6

Results:

2.1. There were no differences in raw reads.

2.2. However, final reads increased after some treatment, and host DNA proportion decreased

QC3. Positive and negative controls

Positive and negative controls were compared with mock community

Reads and host % by treatment

Species richness of controls

Some possible contaminants were identified in extraction controls

#Loading theoretical mock community

zymo_mock <- read_excel("/Users/minsikkim/Dropbox (Partners HealthCare)/@minsik/project_sicas2/data_raw/DAR_20210929_zymo_mock_data.xlsx") %>%
        data.frame(row.names = T) %>% rename(mock_theoretical = Mock) %>% mutate(mock_theoretical = mock_theoretical/100) %>%
        merge_phyloseq(otu_table(., taxa_are_rows = T), tax_table(phyloseq$phyloseq_count))

phyloseq_control_rel <- rbind(c("mock_theoretical", "mock")) %>% data.frame() %>%
        column_to_rownames(var = "X1") %>% rename(sample_type = X2) %>% #making sample_data of mock community
        merge_phyloseq(sample_data(.), zymo_mock) %>%        
        merge_phyloseq(., subset_samples(phyloseq$phyloseq_rel, sample_type == "Pos. ext." | sample_type == "Neg. ext." )) #adding data of controls


#Species richness of each control groups

rowSums(t(otu_table(phyloseq_control_rel)) != 0) %>% kbl(format = "html", caption = "Species richness of controls") %>% 
        kable_styling(full_width = 0, html_font = "serif")
Species richness of controls
x
mock_theoretical 10
20220606_Pos 41
20220606_Neg 3

Bar plot of controls

Some possible contaminants were identified in extraction controls

#Making label for the controls
sample_data(phyloseq_control_rel)$sample_type <- factor(sample_data(phyloseq_control_rel)$sample_type,
                                                        levels = c("mock", "2", "1"),
                                                        labels = c("Mock, 10 spp.", "Pos., 41 spp.", "Neg., 3 spp."))


#Manipulating phyloseq - only top 10 

tax_table(phyloseq_control_rel) %>%
  cbind(species10 = "[Others]") %>%
  {top10species <- head(taxa_sums(phyloseq_control_rel), 10) %>% names()
   .[top10species, "species10"] <- as.character(.[top10species, "Species"])
   .[, 8] <- .[, 8] %>% gsub("s__", "", .) %>% gsub("_", " ", .) %>% paste("<i>", ., "</i>", sep = "")
   phyloseq_temp <- phyloseq_control_rel
   tax_table(phyloseq_temp) <- tax_table(.) 
   phyloseq_temp
  } %>%
        plot_bar(., fill="species10") + 
        ylab("Relative abundancne") +
        theme_classic(base_size = 11, base_family = "serif") +
        ggtitle("Species richness of control data") +
        theme(legend.text = element_markdown()) +
        guides(fill=guide_legend(title="Top 10 species")) +
        facet_wrap (~ sample_type, scales= "free_x", nrow=1)

#there could be opportunistic pathogens...

Results

2.3.1. Negative control showed minimal number of possible contaminants

2.3.2. Positive control contained various contaminants

QC4. Prevalence and abundacne filtering - red flag

Taxa prevance and abundance were checked.

Taxa abundance and prevalence

Histogram of prelanence taxa

No prevalence or abundance filtering (each experimental group is 5% of total sample)

#Calculation of sample prevalence, standard deviation, median abundance across all samples for all bugs and making into a table.
#
#•  In initial analysis we will not perform prevalence or abundance filtering (though we may consider this for secondary differential abundance analyses to manage p (features) > n (sample size) problem and issues with multiple hypothesis correction)
taxa_qc <- data.frame("species" =  otu_table(phyloseq$phyloseq_rel) %>% t() %>% colnames(),
        "prevalence" = ifelse(phyloseq$phyloseq_count %>% otu_table() > 0, 1, 0) %>% t() %>% colSums(), #Prevalence of taxa
        "mean_rel_abd" = phyloseq$phyloseq_rel %>% otu_table() %>% t() %>% colMeans(na.rm = T) #mean relativ abundacne 
)


hist(log10(taxa_qc$prevalence), xlab = "log10(Taxa prevalence)", main = "Histogram of prevalence of taxa")

Histogram of mean abundance

hist(log10(taxa_qc$mean_rel_abd), xlab = "log10(Mean relative abundance)", main = "Histogram of mean relative abundance")

Red flag taxa

Taxa with low prevalences were red-flagged

red_flag_taxa <- data.frame(species = taxa_qc$species,
                          red_flag_prev_abd = ifelse(taxa_qc$prevalence < otu_table(phyloseq$phyloseq_rel) %>% t %>% rownames() %>% length * 0.05 & taxa_qc$mean_rel_abd < quantile(taxa_qc$mean_rel_abd, 0.75), 1, 0))
red_flag_taxa

QC 3 results:

3.1. In initial analysis we will not perform prevalence or abundance filtering (though we may consider this for secondary differentialabundance analyses to manage p (features) > n (sample size) problem and issues with multiple hypothesis correction)

3.2. Red flags were made for taxa not satisfying the criteria (prev < 0.05 & mean rel < 0.75Q)

3.3. Although we don’t consider the prevalence of abundance at this time, we can consider their red-flags after running the DA analysis

Analysis

Before anlayzing, alpha diversity indices were calculated for all phyloseq objects

alpha_diversity <- function(data) {
        otu_table <- otu_table(data) %>% .[colSums(.) !=0]
        S.obs <- rowSums(t(otu_table) != 0)
        sample_data <- sample_data(data)
        data_evenness <- vegan::diversity(t(otu_table)) / log(vegan::specnumber(t(otu_table))) # calculate evenness index using vegan package
        data_shannon <- vegan::diversity(t(otu_table), index = "shannon") # calculate Shannon index using vegan package
        data_hill <- exp(data_shannon)                           # calculate Hills index
        
        data_dominance <- microbiome::dominance(otu_table, index = "all", rank = 1, aggregate = TRUE) # dominance (Berger-Parker index), etc.
        data_invsimpson <- vegan::diversity(t(otu_table), index = "invsimpson")                          # calculate Shannon index using vegan package
        alpha_diversity <- cbind(S.obs, data_shannon, data_hill, data_invsimpson, data_evenness,data_dominance) # combine all indices in one data table
        sample_data <- merge(data.frame(sample_data), alpha_diversity, by = 0, all = T) %>% column_to_rownames(var = "Row.names")
}

sample_data(phyloseq$phyloseq_rel) <- sample_data(alpha_diversity(phyloseq$phyloseq_count))
sample_data(phyloseq$phyloseq_count) <- sample_data(alpha_diversity(phyloseq$phyloseq_count)) 
sample_data(phyloseq$phyloseq_path_rpkm) <- sample_data(alpha_diversity(phyloseq$phyloseq_path_rpkm))

A1. Host DNA, bacterial DNA by smaple type and treatment

qPCR and sequencing results

qPCR result

#2A: Change in total DNA (qPCR)


f2a <- ggplot(data_qPCR, aes(x = sample_type, y = log10(DNA_host_nondil + DNA_bac_nondil))) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("log<sub>10</sub>(qPCR total DNA)<br>(ng/μL)") +
        xlab("Sample type") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "A") +
        scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
        theme(plot.tag = element_text(size = 15), axis.title.y = element_markdown()) +              # Plot title size
        guides(fill = guide_legend(nrow = 1, title = "Treatment"))


#2B: Change in human DNA (qPCR)
f2b <- ggplot(data_qPCR, aes(x = sample_type, y = log10(DNA_host_nondil))) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")) +
        ylab("log<sub>10</sub>(qPCR host DNA)<br>(ng/μL)") +
        xlab("Sample type") +
        theme_classic (base_size = 12, base_family = "serif")+ 
        labs(tag = "B") +
        scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
        theme(plot.tag = element_text(size = 15), axis.title.y = element_markdown()) +              # Plot title size
        guides(fill = guide_legend(nrow = 1, title = "Treatment"))
#2C: Change in 16S DNA (qPCR)
f2c <- ggplot(data_qPCR, aes(x = sample_type, y = log10(DNA_bac_nondil))) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")) +
        ylab("log<sub>10</sub>(qPCR bacterial DNA)<br>(ng/μL)") +
        xlab("Sample type") +
        theme_classic (base_size = 12, base_family = "serif")+ 
        labs(tag = "C") +
        scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
        theme(plot.tag = element_text(size = 15), axis.title.y = element_markdown()) +              # Plot title size
        guides(fill = guide_legend(nrow = 1, title = "Treatment"))

#2D. Change in % host (qPCR)
f2d <- ggplot(data_qPCR, aes(x = sample_type, y = log10(host_proportion * 100))) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")) +
        ylab("log<sub>10</sub>(host DNA) (%)") +
        xlab("Sample type") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "D") +
        scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
        theme(plot.tag = element_text(size = 15), axis.title.y = element_markdown()) +              # Plot title size
        guides(fill = guide_legend(nrow = 1, title = "Treatment"))

#output for markdown
ggarrange(f2a, f2b, f2c, f2d, common.legend = T , align = "hv")

Figure 2. qPCR result of host depletion study. A. Total DNA B. Host DNA C. Bacterial DNA D. Host %

Sequencing result

f3a <- ggplot(subset(sample_data, sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(x = sample_type, y = Raw_reads)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_x_discrete(name ="Sample type")+
        ylab("Raw reads") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "A") +
        theme(plot.tag = element_text(size = 15)) +              # Plot title size
        guides(fill = guide_legend(nrow = 1))

#   - Host_mapped


f3b <- ggplot(subset(sample_data, sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(x = sample_type, y = Host_mapped)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_x_discrete(name ="Sample type")+
        ylab("Host mapped reaeds") +
        theme_classic (base_size = 12, base_family = "serif")+ 
        labs(tag = "B") +
        theme(plot.tag = element_text(size = 15)) +              # Plot title size
        guides(fill = guide_legend(nrow = 1))


#   - % Host (we have used Host_mapped/Raw_reads in prior papers)



f3d <- ggplot(subset(sample_data, sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(x = sample_type, y = Host_mapped/Raw_reads)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_x_discrete(name ="Sample type")+
        ylab("Sequencing host DNA (%)") +
        theme_classic (base_size = 12, base_family = "serif")+ 
        labs(tag = "D") +
        theme(plot.tag = element_text(size = 15)) +              # Plot title size
        guides(fill = guide_legend(nrow = 1))


#   - Final_reads

f3c <- ggplot(subset(sample_data, sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(x = sample_type, y = Final_reads)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_x_discrete(name ="Sample type")+
        ylab("Final reads") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "C") +
        theme(plot.tag = element_text(size = 15)) +              # Plot title size
        guides(fill = guide_legend(nrow = 1))


ggarrange(f3a, f3b, f3c, f3d, common.legend = T, align = "hv")

Figure 3. Sequencing result of host depletion study. A. Total DNA B. Host DNA C. Bacterial DNA D. Host %

Results A1

1.1. Some changed were observed, for both host DNA and bacterial DNA.

1.2. Sequencing results need to be added

This will be Fig 2. of the manuscript, after removing positives and negatives

A2. Modeling on sequencing results

As some changed were observed after treatment, linear mixed effect models were employed for testing.

Test results

Library failure - ANOVA

Some samples failed in library prep. What type of sample were fragile to treatments?

glm ( library fail ~ sample_type + treatment + sample_type * treatment + subject_id )

glmer(lib_failed ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        Anova %>% data.frame(check.names = F) %>%
        mutate(` ` = case_when(abs(`Pr(>Chisq)`) < 0.05 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>% column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Chisq Df Pr(>Chisq)
sample_type 14.08825 2 0.0008725
treatment 36.68858 5 0.0000007
sample_type * treatment 34.06531 10 0.0001801

Library failure

glm ( sequencing fail ~ sample_type + treatment + sample_type * treatment + subject_id )

Nasal swabs were fragile to lyPMA and Molysis

glmer(lib_failed ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        summary %>% .$coefficients %>% data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("sample_type|treatment", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>% 
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error t value
(Intercept) 0.0000000 0.1167011 0.0000000
Nasal swab -0.0204626 0.1470919 -0.1391145
Sputum 0.0000000 0.1650403 0.0000000
lyPMA 0.2000000 0.1619675 1.2348160
Benzonase 0.0000000 0.1619675 0.0000000
Host zero 0.2000000 0.1619675 1.2348160
Molysis 0.2000000 0.1619675 1.2348160
QIAamp 0.0000000 0.1619675 0.0000000
Nasal swab * lyPMA 0.5978181 0.2143680 2.7887474
Sputum * lyPMA -0.2000000 0.2290566 -0.8731468
Nasal swab * Benzonase -0.0021819 0.2143680 -0.0101781
Sputum * Benzonase 0.0000000 0.2290566 0.0000000
Nasal swab * Host zero 0.1978181 0.2143680 0.9227971
Sputum * Host zero -0.2000000 0.2290566 -0.8731468
Nasal swab * Molysis 0.6021819 0.2143680 2.8091035
Sputum * Molysis -0.2000000 0.2290566 -0.8731468
Nasal swab * QIAamp 0.0021819 0.2143680 0.0101781
Sputum * QIAamp 0.0000000 0.2290566 0.0000000

Sequencing failure

Modeling of sequencing failure were not available due to low number of cases.

BAL079 - control & lyPMA failed sequencing.

sample_data(phyloseq$phyloseq_count) %>% data.frame %>% mutate(sequencing_fail = (S.obs == 0)) %>%
glmer(sequencing_fail ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = .) %>%
        summary %>% .$coefficients %>% data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("sample_type|treatment", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>% 
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error t value
(Intercept) 0.2 0.0667184 2.997673
Nasal swab -0.2 0.0929713 -2.151203
Sputum -0.2 0.0943541 -2.119675
lyPMA 0.0 0.0797156 0.000000
Benzonase -0.2 0.0797156 -2.508919
Host zero -0.2 0.0797156 -2.508919
Molysis -0.2 0.0797156 -2.508919
QIAamp -0.2 0.0797156 -2.508919
Nasal swab * lyPMA 0.0 0.1057299 0.000000
Sputum * lyPMA 0.0 0.1127349 0.000000
Nasal swab * Benzonase 0.2 0.1057299 1.891613
Sputum * Benzonase 0.2 0.1127349 1.774074
Nasal swab * Host zero 0.2 0.1057299 1.891613
Sputum * Host zero 0.2 0.1127349 1.774074
Nasal swab * Molysis 0.2 0.1057299 1.891613
Sputum * Molysis 0.2 0.1127349 1.774074
Nasal swab * QIAamp 0.2 0.1057299 1.891613
Sputum * QIAamp 0.2 0.1127349 1.774074

log10(Final reads) - ANOVA

Which methods was effective in increasing the final reads?

Interaction term was significant

lmer(log10(Final_reads) ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        anova %>% data.frame(check.names = F) %>%
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>% column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 4.548499 2.2742495 2 9.018238 12.175621 0.0027403
treatment 22.419714 4.4839428 5 68.204420 24.005629 0.0000000
sample_type * treatment 6.545334 0.6545334 10 68.184280 3.504167 0.0009047

log10(Final reads)

Which methods was effective in increasing the final reads?

lmer( log10(Final reads) vs sample_type + treatment + sample_type * treatment + subject_id )

Except lyPMA, every methods increased final reads

lmer(log10(Final_reads) ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        summary %>% .$coefficients %>% data.frame(check.names = F) %>%
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("sample_type|treatment", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>% 
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 5.4933063 0.2095210 62.79820 26.2184086 0.0000000
Nasal swab 1.1246781 0.2789651 37.26869 4.0316080 0.0002633
Sputum 0.3332900 0.2963074 62.79820 1.1248117 0.2649498
lyPMA 0.3521385 0.2733402 67.65016 1.2882794 0.2020376
Benzonase 0.8109049 0.2733402 67.65016 2.9666509 0.0041580
Host zero 0.9501904 0.2733402 67.65016 3.4762193 0.0008929
Molysis 1.0358039 0.2733402 67.65016 3.7894313 0.0003238
QIAamp 1.0480985 0.2733402 67.65016 3.8344107 0.0002788
Nasal swab * lyPMA -0.8774152 0.3621899 68.18953 -2.4225278 0.0180749
Sputum * lyPMA 0.1879364 0.3865614 67.65016 0.4861747 0.6284145
Nasal swab * Benzonase -0.6887274 0.3621899 68.18953 -1.9015642 0.0614539
Sputum * Benzonase 0.0349655 0.3865614 67.65016 0.0904526 0.9281949
Nasal swab * Host zero -0.1157121 0.3621899 68.18953 -0.3194791 0.7503399
Sputum * Host zero 0.7231403 0.3865614 67.65016 1.8706997 0.0657128
Nasal swab * Molysis -0.8411412 0.3621899 68.18953 -2.3223760 0.0232052
Sputum * Molysis 0.9543008 0.3865614 67.65016 2.4686913 0.0160928
Nasal swab * QIAamp 0.0236242 0.3621899 68.18953 0.0652260 0.9481849
Sputum * QIAamp 0.3676901 0.3865614 67.65016 0.9511817 0.3448982

Host % ANOVA

Which methods was effective in lowering host %

Interaction term was significant

lmer(sequencing_host_prop ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        anova %>% data.frame(check.names = F) %>%
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>% column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 1.1399050 0.5699525 2 3.486053 27.593506 0.0072923
treatment 2.1022767 0.4204553 5 68.821547 20.355796 0.0000000
sample_type * treatment 0.9200108 0.0920011 10 68.804812 4.454112 0.0000768

Host %

Which methods was effective in lowering host %

lmer( Host DNA % vs sample_type + treatment + sample_type * treatment + (1|subject_id) )

Host zero was effect to to all types. Molysis was effective to nasal swab and sputum. QIAamp was effective for nasal swab only.

lmer(sequencing_host_prop ~ sample_type + treatment + sample_type * treatment + (1|subject_id), data = sample_data %>% data.frame) %>%
        summary %>% .$coefficients %>% data.frame(check.names = F) %>%
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
        rownames_to_column(var = "x") %>% mutate(x = gsub("sample_type|treatment", "", x))  %>% mutate(x = gsub(":", " * ", x)) %>% 
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>% kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.8200713 0.0646272 75.67093 12.6892671 0.0000000
Nasal swab -0.0378253 0.0799149 41.00023 -0.4733196 0.6384957
Sputum -0.0293956 0.0913966 75.67093 -0.3216265 0.7486224
lyPMA -0.0377605 0.0908962 68.23467 -0.4154243 0.6791348
Benzonase -0.0324363 0.0908962 68.23467 -0.3568498 0.7223055
Host zero -0.2000025 0.0908962 68.23467 -2.2003393 0.0311715
Molysis -0.1577230 0.0908962 68.23467 -1.7351990 0.0872202
QIAamp -0.0928027 0.0908962 68.23467 -1.0209737 0.3108732
Nasal swab * lyPMA -0.1931415 0.1202628 68.80454 -1.6059959 0.1128550
Sputum * lyPMA 0.0106590 0.1285467 68.23467 0.0829196 0.9341582
Nasal swab * Benzonase -0.1301969 0.1202628 68.80454 -1.0826036 0.2827639
Sputum * Benzonase -0.0432386 0.1285467 68.23467 -0.3363648 0.7376281
Nasal swab * Host zero -0.3924910 0.1202628 68.80454 -3.2636117 0.0017148
Sputum * Host zero -0.1532698 0.1285467 68.23467 -1.1923278 0.2372628
Nasal swab * Molysis -0.2637315 0.1202628 68.80454 -2.1929608 0.0316937
Sputum * Molysis -0.3863225 0.1285467 68.23467 -3.0053090 0.0037094
Nasal swab * QIAamp -0.5240678 0.1202628 68.80454 -4.3576894 0.0000449
Sputum * QIAamp -0.0370057 0.1285467 68.23467 -0.2878775 0.7743131

Results

1. Library failure was associated with nasal swab, especially after lyPMA and Molysis treatment

2. Benzonase, host-zero, Molysis, and QIAamp increased final reads

3. Host-zero lowered host %. For otheres, there were significant sample_type specific treatment efficiencies

A3. LM of taxa alpha diversity

Alpha diversity could be having changes due to treatment.

Both stratified and nonstratified analyses were conducted.

Figure - Alpha diversity

sample_data <- sample_data(phyloseq$phyloseq_count)
f4a <-        ggplot(subset(sample_data(phyloseq$phyloseq_count), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = S.obs)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Species richness") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "A") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))


f4b <-        ggplot(subset(sample_data(phyloseq$phyloseq_count), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = data_shannon)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Shannon") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "B") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

f4c <-        ggplot(subset(sample_data(phyloseq$phyloseq_count), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = data_invsimpson)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Inverse simpson") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "C") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

f4d <-        ggplot(subset(sample_data(phyloseq$phyloseq_count), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = dbp)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Berger-Parker index") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "D") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

ggarrange(f4a, f4b, f4c, f4d, common.legend = T, align = "hv") # alpha diversity plots

Species richness

All samples:

S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

S.obs ~ sample_type + log10 (Final_reads) + (1|original_sample)

Species richness (all samples & interaction term) - ANOVA

Interaction term was significant

sample_data <- sample_data(phyloseq$phyloseq_count) %>% data.frame(check.names = F) %>% subset(., !is.nan(.$simpson))
lmer_sob <- lmer(S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)
lmer_sob %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 7547.487 3773.744 2 10.79325 30.28214 0.0000374
treatment 7333.548 1466.710 5 64.92720 11.76951 0.0000000
log10(Final_reads) 2018.806 2018.806 1 67.56866 16.19977 0.0001465
sample_type:treatment 19463.441 1946.344 10 64.59903 15.61830 0.0000000

Species richness (all samples & interaction term)

Incrase at sputum was at every treatment

lmer(S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -62.286176 18.946348 73.91996 -3.2875031 0.0015486
Nasal swab -9.894010 13.143390 19.76996 -0.7527746 0.4604558
Sputum 5.066489 11.833705 23.07241 0.4281405 0.6725190
lyPMA -3.032607 7.992068 64.29757 -0.3794521 0.7056025
Benzonase -4.719045 8.051585 65.30664 -0.5861014 0.5598277
Host zero -2.064590 8.208256 65.44300 -0.2515260 0.8021956
Molysis 10.862491 8.314322 65.52729 1.3064795 0.1959566
QIAamp -2.491588 8.330143 65.53938 -0.2991050 0.7658063
log10(Final_reads) 12.532134 3.113656 67.56866 4.0248934 0.0001465
Nasal swab * lyPMA 4.202591 10.419319 64.56848 0.4033461 0.6880263
Sputum * lyPMA 34.664316 10.599221 64.21630 3.2704588 0.0017287
Nasal swab * Benzonase 1.775047 10.042806 64.92798 0.1767481 0.8602564
Sputum * Benzonase 62.518484 10.357810 64.45145 6.0358784 0.0000001
Nasal swab * Host zero 4.793943 9.787379 64.68743 0.4898086 0.6259264
Sputum * Host zero 92.094185 10.559614 64.43191 8.7213589 0.0000000
Nasal swab * Molysis -7.289176 10.178107 65.15977 -0.7161623 0.4764500
Sputum * Molysis 87.197251 10.723044 64.48528 8.1317631 0.0000000
Nasal swab * QIAamp -3.726532 9.774308 64.66718 -0.3812579 0.7042616
Sputum * QIAamp 70.548734 10.400892 64.40670 6.7829501 0.0000000

Species richness - stratified (NS)

Molysis and host zero may incrased speciess richness of nasal swab

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -26.0233860 9.340261 28 -2.7861520 0.0094654
lyPMA -2.1153025 2.324810 28 -0.9098818 0.3706512
Benzonase -1.7637014 2.206148 28 -0.7994486 0.4307605
Host zero 8.8224892 2.495655 28 3.5351399 0.0014386
Molysis 4.5783098 2.217883 28 2.0642703 0.0483678
QIAamp 0.8360832 2.678401 28 0.3121575 0.7572336
log10(Final_reads) 5.6349921 1.419898 28 3.9685903 0.0004571

Species richness at BAL

No changes observed

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -100.395549 25.504894 20.38095 -3.9363249 0.0007925
lyPMA -5.839094 7.201885 17.39942 -0.8107731 0.4284473
Benzonase -10.667780 7.752673 18.42945 -1.3760131 0.1853100
Host zero -8.986746 8.091470 18.61512 -1.1106444 0.2808621
Molysis 3.342010 8.316699 18.72132 0.4018433 0.6923498
QIAamp -10.097992 8.350027 18.73603 -1.2093364 0.2415732
log10(Final_reads) 19.520813 4.535759 19.97726 4.3037587 0.0003466

Species richness at sputum

Benzonase may incrased speciess richness of sputum

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -166.73637 94.19800 20.78908 -1.770063 0.0913826
lyPMA 21.48046 13.48708 19.68002 1.592670 0.1271695
Benzonase 41.90046 17.06102 19.99448 2.455918 0.0233254
Host zero 58.57768 28.77613 20.32431 2.035634 0.0550337
Molysis 60.65374 33.57099 20.37080 1.806731 0.0855997
QIAamp 41.44599 24.96239 20.26633 1.660337 0.1122410
log10(Final_reads) 31.32813 16.05007 20.49824 1.951899 0.0647464

Shannon

All samples:

Shannon ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

Shannon ~ sample_type + log10 (Final_reads) + (1|original_sample)

ANOVA Shannon

Similar pattern with species richness

lmer_shannon <- lmer(data_shannon ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)
lmer_shannon %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>%
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 3.3128614 1.6564307 2 10.63403 7.318283 0.0100286
treatment 6.1394953 1.2278991 5 64.78018 5.424986 0.0003143
log10(Final_reads) 0.9864384 0.9864384 1 67.39196 4.358187 0.0406149
sample_type * treatment 5.5684422 0.5568442 10 64.45592 2.460196 0.0147100

Shannon (all samples & interaction term)

Similar pattern with species richness

#Shannon

lmer_shannon %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.2922817 0.8090108 73.88309 2.8334375 0.0059333
Nasal swab 0.4603976 0.5653783 19.28942 0.8143179 0.4254023
Sputum 0.5529713 0.5086504 22.46019 1.0871343 0.2885070
lyPMA 0.1726922 0.3406059 64.15772 0.5070146 0.6138828
Benzonase 0.3896151 0.3431787 65.15496 1.1353129 0.2604034
Host zero 0.2374929 0.3498614 65.28972 0.6788198 0.4996513
Molysis 1.0035787 0.3543855 65.37303 2.8318841 0.0061444
QIAamp 0.4240802 0.3550603 65.38498 1.1943892 0.2366407
log10(Final_reads) -0.2771226 0.1327452 67.39196 -2.0876272 0.0406149
Nasal swab * lyPMA -0.4306354 0.4440629 64.42526 -0.9697620 0.3357919
Sputum * lyPMA 0.9407250 0.4517137 64.07751 2.0825690 0.0412874
Nasal swab * Benzonase -0.1913205 0.4280323 64.78076 -0.4469766 0.6563828
Sputum * Benzonase 1.1910341 0.4414361 64.30980 2.6980895 0.0088969
Nasal swab * Host zero 0.2059598 0.4171353 64.54324 0.4937482 0.6231601
Sputum * Host zero 1.5319202 0.4500358 64.29022 3.4039963 0.0011480
Nasal swab * Molysis -0.6557050 0.4338096 65.01040 -1.5115042 0.1355060
Sputum * Molysis 0.9831440 0.4570035 64.34280 2.1512836 0.0352141
Nasal swab * QIAamp -0.2868142 0.4165773 64.52336 -0.6885016 0.4936051
Sputum * QIAamp 1.0954341 0.4432702 64.26548 2.4712562 0.0161247

Shannon - stratified (NS)

Similar pattern with species richness

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.4705305 0.7141282 28 4.859814 0.0000407
lyPMA -0.3473353 0.1777480 28 -1.954088 0.0607452
Benzonase 0.1820609 0.1686754 28 1.079356 0.2896393
Host zero 0.5077044 0.1908103 28 2.660782 0.0127593
Molysis 0.3999084 0.1695727 28 2.358331 0.0255726
QIAamp 0.2884031 0.2047825 28 1.408339 0.1700383
log10(Final_reads) -0.3901164 0.1085611 28 -3.593519 0.0012351

Shannon - stratified (BAL)

Similar pattern with species richness

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.4183961 1.4942036 20.85902 0.2800128 0.7822258
lyPMA 0.0342340 0.4350346 17.07472 0.0786927 0.9381919
Benzonase 0.0898923 0.4638505 18.70718 0.1937959 0.8484245
Host zero -0.1102537 0.4832242 18.99852 -0.2281627 0.8219575
Molysis 0.6263137 0.4961366 19.16322 1.2623816 0.2219587
QIAamp 0.0425762 0.4980493 19.18590 0.0854859 0.9327609
log10(Final_reads) 0.0676641 0.2665336 20.80923 0.2538672 0.8020897

Shannon - stratified (Spt)

Similar pattern with species richness

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.1874597 3.2210139 20.47081 0.9895827 0.3339316
lyPMA 1.1451368 0.4583906 19.43439 2.4981682 0.0215971
Benzonase 1.6303288 0.5807036 19.65341 2.8075057 0.0109962
Host zero 1.8676909 0.9809856 19.88476 1.9038922 0.0714978
Molysis 2.1036052 1.1447010 19.91755 1.8376897 0.0810732
QIAamp 1.6026662 0.8507368 19.84393 1.8838567 0.0743222
log10(Final_reads) -0.3358544 0.5476150 20.00774 -0.6133039 0.5465850

Simpson

Inverse Simpson of all samples:

Inverse Simpson ~ sample_type * treatment + (1|original_sample)

Stratified:

Inverse Simpson ~ treatment + (1|original_sample)

Inv Simp - ANOVA

lmer_invsimpson <- lmer(data_invsimpson ~ sample_type * treatment + (1|subject_id), data = sample_data)

lmer_invsimpson %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 66.71888 33.35944 2 10.09954 6.331375 0.0165144
treatment 108.79840 21.75968 5 65.57511 4.129826 0.0025391
sample_type * treatment 146.81964 14.68196 10 65.54979 2.786528 0.0062113

Simpson (all samples & interaction term)

#Simpson

lmer_invsimpson %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.7636785 1.590206 31.25786 1.7379371 0.0920661
Nasal swab -0.7517231 2.274375 19.94045 -0.3305186 0.7444557
Sputum 0.1251783 2.177249 28.40089 0.0574938 0.9545544
lyPMA -0.2135194 1.623100 65.17797 -0.1315504 0.8957452
Benzonase -0.4293400 1.557133 65.78621 -0.2757247 0.7836238
Host zero -1.0688451 1.557133 65.78621 -0.6864187 0.4948615
Molysis 2.3309395 1.557133 65.78621 1.4969432 0.1391907
QIAamp -0.7961192 1.557133 65.78621 -0.5112725 0.6108720
Nasal swab * lyPMA 0.0633029 2.059718 65.40566 0.0307338 0.9755754
Sputum * lyPMA 3.6202491 2.177618 65.17797 1.6624816 0.1012185
Nasal swab * Benzonase 0.9659709 2.008145 65.78191 0.4810265 0.6320938
Sputum * Benzonase 8.5483828 2.128903 65.50473 4.0153940 0.0001553
Nasal swab * Host zero 1.3633721 2.008145 65.78191 0.6789212 0.4995696
Sputum * Host zero 7.2535197 2.128903 65.50473 3.4071637 0.0011264
Nasal swab * Molysis -1.5517967 2.008145 65.78191 -0.7727514 0.4424373
Sputum * Molysis 4.3813408 2.128903 65.50473 2.0580278 0.0435704
Nasal swab * QIAamp 0.4532215 2.008145 65.78191 0.2256916 0.8221410
Sputum * QIAamp 5.6968238 2.128903 65.50473 2.6759438 0.0094040

Inverse Simpson - stratified (NS)

Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)

lmer(data_invsimpson ~ treatment + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.1261254 0.2740225 29 7.7589453 0.0000000
lyPMA -0.2057118 0.4746209 29 -0.4334234 0.6679143
Benzonase 0.4811356 0.4746209 29 1.0137263 0.3190970
Host zero 0.2390318 0.4746209 29 0.5036268 0.6183281
Molysis 0.8346381 0.4746209 29 1.7585366 0.0892065
QIAamp -0.2874024 0.4746209 29 -0.6055411 0.5495297

Inverse Simpson - stratified (BAL)

lmer(data_invsimpson ~ treatment + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.3496371 1.416668 22 1.6585655 0.1113944
lyPMA -0.2135194 2.003471 22 -0.1065747 0.9160922
Benzonase -0.0152987 1.900660 22 -0.0080491 0.9936503
Host zero -0.6548037 1.900660 22 -0.3445139 0.7337314
Molysis 2.7449809 1.900660 22 1.4442252 0.1627674
QIAamp -0.3820778 1.900660 22 -0.2010238 0.8425269

Inverse Simpson - stratified (spt)

lmer(data_invsimpson ~ treatment + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.888857 2.105591 8.714033 1.371993 0.2043471
lyPMA 3.406730 1.901306 20.000000 1.791784 0.0883118
Benzonase 8.119043 1.901306 20.000000 4.270245 0.0003740
Host zero 6.184675 1.901306 20.000000 3.252856 0.0039845
Molysis 6.712280 1.901306 20.000000 3.530352 0.0021020
QIAamp 4.900705 1.901306 20.000000 2.577546 0.0179780

Burger-Parker index

BPI of all samples:

BPI ~ sample_type + treatment + log10 (Final_reads) + (1|original_sample)

BPI stratified:

BPI ~ treatment + log10 (Final_reads) + (1|original_sample)

BPI -ANOVA

lmer_dbp <- lmer(dbp ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)

lmer_dbp %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.2829103 0.1414552 2 10.48239 4.591210 0.0369787
treatment 0.9164438 0.1832888 5 64.73516 5.949003 0.0001381
log10(Final_reads) 0.3982546 0.3982546 1 68.06101 12.926151 0.0006086
sample_type * treatment 0.4601290 0.0460129 10 64.32239 1.493441 0.1624116

BPI - all samples & interaction term

#BPI
lmer_dbp <- lmer(dbp ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)
lmer_dbp %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html", caption = "Without interaction term") %>%
        kable_styling(full_width = 0, html_font = "serif")
Without interaction term
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -0.2035448 0.2934875 73.98607 -0.6935381 0.4901436
Nasal swab -0.2746027 0.1918494 21.39063 -1.4313447 0.1667807
Sputum -0.1751176 0.1739103 25.68997 -1.0069421 0.3233533
lyPMA -0.1112415 0.1256539 63.94756 -0.8853014 0.3793110
Benzonase -0.2369262 0.1264658 65.21502 -1.8734412 0.0654896
Host zero -0.1391248 0.1289092 65.38675 -1.0792465 0.2844429
Molysis -0.4135086 0.1305640 65.49290 -3.1670959 0.0023387
QIAamp -0.2351370 0.1308108 65.50813 -1.7975344 0.0768596
log10(Final_reads) 0.1754176 0.0487909 68.06101 3.5952957 0.0006086
Nasal swab * lyPMA 0.1304024 0.1637733 64.28956 0.7962372 0.4288246
Sputum * lyPMA -0.2040959 0.1666573 63.84450 -1.2246442 0.2252088
Nasal swab * Benzonase 0.1025637 0.1578004 64.73833 0.6499583 0.5180190
Sputum * Benzonase -0.2217425 0.1628249 64.14039 -1.3618465 0.1780105
Nasal swab * Host zero -0.0772609 0.1538230 64.43370 -0.5022718 0.6171886
Sputum * Host zero -0.4529917 0.1660001 64.11913 -2.7288631 0.0081927
Nasal swab * Molysis 0.2090844 0.1598905 65.02325 1.3076721 0.1955892
Sputum * Molysis -0.2825321 0.1685606 64.18786 -1.6761456 0.0985744
Nasal swab * QIAamp 0.1125011 0.1536206 64.40678 0.7323308 0.4666234
Sputum * QIAamp -0.2947874 0.1635091 64.08537 -1.8028812 0.0761087
#BPI
lmer_dbp %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html", caption = "With interaction term") %>%
        kable_styling(full_width = 0, html_font = "serif")
With interaction term
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -0.2035448 0.2934875 73.98607 -0.6935381 0.4901436
Nasal swab -0.2746027 0.1918494 21.39063 -1.4313447 0.1667807
Sputum -0.1751176 0.1739103 25.68997 -1.0069421 0.3233533
lyPMA -0.1112415 0.1256539 63.94756 -0.8853014 0.3793110
Benzonase -0.2369262 0.1264658 65.21502 -1.8734412 0.0654896
Host zero -0.1391248 0.1289092 65.38675 -1.0792465 0.2844429
Molysis -0.4135086 0.1305640 65.49290 -3.1670959 0.0023387
QIAamp -0.2351370 0.1308108 65.50813 -1.7975344 0.0768596
log10(Final_reads) 0.1754176 0.0487909 68.06101 3.5952957 0.0006086
Nasal swab * lyPMA 0.1304024 0.1637733 64.28956 0.7962372 0.4288246
Sputum * lyPMA -0.2040959 0.1666573 63.84450 -1.2246442 0.2252088
Nasal swab * Benzonase 0.1025637 0.1578004 64.73833 0.6499583 0.5180190
Sputum * Benzonase -0.2217425 0.1628249 64.14039 -1.3618465 0.1780105
Nasal swab * Host zero -0.0772609 0.1538230 64.43370 -0.5022718 0.6171886
Sputum * Host zero -0.4529917 0.1660001 64.11913 -2.7288631 0.0081927
Nasal swab * Molysis 0.2090844 0.1598905 65.02325 1.3076721 0.1955892
Sputum * Molysis -0.2825321 0.1685606 64.18786 -1.6761456 0.0985744
Nasal swab * QIAamp 0.1125011 0.1536206 64.40678 0.7323308 0.4666234
Sputum * QIAamp -0.2947874 0.1635091 64.08537 -1.8028812 0.0761087

BPI - stratified (NS)

BPI ~ sample_type + log10 (Final_reads) + (1|original_sample)

lmer(dbp ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
x Estimate Std. Error df t value Pr(>|t|)
(Intercept) -0.8393907 0.3486121 27.99482 -2.4078071 0.0228843
lyPMA 0.0510698 0.0805909 25.70455 0.6336915 0.5318810
Benzonase -0.1377638 0.0764151 25.81764 -1.8028343 0.0831000
Host zero -0.2586338 0.0873090 26.15521 -2.9622802 0.0064248
Molysis -0.2183024 0.0767411 25.72237 -2.8446604 0.0086028
QIAamp -0.1843464 0.0934638 25.80702 -1.9723826 0.0593709
log10(Final_reads) 0.2299546 0.0508548 26.50782 4.5217915 0.0001142

BPI - stratified (BAL)

lmer(dbp ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
x Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.3043331 0.5301349 20.47181 0.5740673 0.5721764
lyPMA -0.0735485 0.1510204 17.01214 -0.4870103 0.6324666
Benzonase -0.1530734 0.1621430 18.28749 -0.9440639 0.3574437
Host zero -0.0421982 0.1691435 18.51848 -0.2494819 0.8057344
Molysis -0.3085461 0.1738005 18.65049 -1.7752889 0.0921718
QIAamp -0.1290205 0.1744898 18.66877 -0.7394156 0.4688510
log10(Final_reads) 0.0815548 0.0944078 20.17656 0.8638567 0.3978202

BPI - stratified (spt)

lmer(dbp ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
x Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.0286770 1.1254512 20.89128 0.0254805 0.9799136
lyPMA -0.2775806 0.1614755 19.71412 -1.7190265 0.1012737
Benzonase -0.3995336 0.2041454 20.07039 -1.9571028 0.0643899
Host zero -0.4751333 0.3441062 20.44288 -1.3807754 0.1822586
Molysis -0.5569118 0.4014067 20.49523 -1.3874001 0.1802223
QIAamp -0.4309461 0.2985350 20.37752 -1.4435365 0.1640679
log10(Final_reads) 0.1055072 0.1918616 20.63853 0.5499133 0.5882813

*** Results: ***

3.1. Species richness - type * method specific. Sputum showed the highest changes, in every methods

3.2. Stratified analysis showed that some methods increased some alpha diversity indices. Changes were highest at sputum. However, stratified analysis showed Benzonase was the only one showed significant changes.

A4. Taxa beta diversity

Permanova (Taxa dist ~ log10(final reads) + sample_type + treatment + sample_type * treatment + subject_id) –> both stratified and nonstratified

Beta diversity figures

PCoA based on Bray-Curtis dissimilarities

phyloseq_rel_nz <- subset_samples(phyloseq$phyloseq_rel, S.obs != 0 & sample_type %in% c("BAL", "Nasal swab", "Sputum"))

bray_perm_uni <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + treatment + subject_id,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F), permutations = 10000)


bray_perm_uni_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + treatment,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_inter <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type * treatment + log10(Final_reads),
                                  data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F), 
                                  strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)


bray_perm_ns <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                               data = subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab") %>%
                                       sample_data %>% data.frame(check.names = F),
                               strata = subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab") %>% 
                                       sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_bal  <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "BAL"), method="bray") ~  lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                 data = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>% sample_data %>% data.frame(check.names = F),
                                 strata = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>%
                                         sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_spt <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz, sample_type == "Sputum") %>% sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
                                %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)


ordinate(phyloseq_rel_nz,  method = "PCoA", distance = "bray") %>%
        plot_ordination(phyloseq_rel_nz, ., col = "treatment", shape = "sample_type" ) + 
        #scale_color_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) +
        scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_shape(name = "Sample type", labels = c("BAL", "Nasal swab", "Sputum")) +
        geom_point(size = 3) +
        theme_classic (base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, 'cm'), legend.key.height = unit(0.4, "cm")) + #legend.position = c(0.9, 0.4)
        labs(tag = "E")

Beta diversity boxplot

Distances between samples within each subject. Mean distance between control <-> treatment for each subject

#distances of betadiversity - boxplots
bray_dist_long <- distance(phyloseq_rel_nz, method="bray") %>% as.matrix() %>% melt_dist() #making long data of distance matrices
#Adding sample type and treatment name. 
#this can be also done by merging metadata into the `bray_dist_long`
names <- data.frame(str_split_fixed(bray_dist_long$iso1, "_", 3))
names2 <- data.frame(str_split_fixed(bray_dist_long$iso2, "_", 3))
bray_dist_long$sample_id_1 <- paste(names$X1, names$X2, sep = "_")
bray_dist_long$method_1 <- ifelse(grepl("control", bray_dist_long$iso1),"control", 
                                  ifelse(grepl("lyPMA", bray_dist_long$iso1),"lypma", 
                                         ifelse(grepl("benzonase", bray_dist_long$iso1),"benzonase", 
                                                ifelse(grepl("host", bray_dist_long$iso1),"host_zero", 
                                                       ifelse(grepl("qia", bray_dist_long$iso1),"qiaamp", 
                                                              ifelse(grepl("moly", bray_dist_long$iso1),"molysis", 
                                                                     NA))))))
#Adding data for iso 2 also should be done
bray_dist_long$sample_id_2 <- paste(names2$X1, names2$X2, sep = "_")
bray_dist_long$method_2 <-ifelse(grepl("control", bray_dist_long$iso2),"control", 
                                 ifelse(grepl("lyPMA", bray_dist_long$iso2),"lypma", 
                                        ifelse(grepl("benzonase", bray_dist_long$iso2),"benzonase", 
                                               ifelse(grepl("host", bray_dist_long$iso2),"host_zero", 
                                                      ifelse(grepl("qia", bray_dist_long$iso2),"qiaamp", 
                                                             ifelse(grepl("moly", bray_dist_long$iso2),"molysis", 
                                                                    NA))))))
#subsetting distances of my interest
path_bray_dist_long_within_sampleid <- subset(bray_dist_long, bray_dist_long$sample_id_1 == bray_dist_long$sample_id_2)
path_bray_dist_long_within_sampleid_from_control <- subset(path_bray_dist_long_within_sampleid, path_bray_dist_long_within_sampleid$method_1 == "control" | path_bray_dist_long_within_sampleid$method_2 == "control" )
path_bray_dist_long_within_sampleid_from_control$treatment <- path_bray_dist_long_within_sampleid_from_control$method_1
path_bray_dist_long_within_sampleid_from_control$treatment <- ifelse(path_bray_dist_long_within_sampleid_from_control$treatment == "control", path_bray_dist_long_within_sampleid_from_control$method_2, path_bray_dist_long_within_sampleid_from_control$treatment)
path_bray_dist_long_within_sampleid_from_control$sample_type <- ifelse(grepl("NS", path_bray_dist_long_within_sampleid_from_control$iso1), "nasal_swab",
                                                                  ifelse(grepl("CFB", path_bray_dist_long_within_sampleid_from_control$iso1), "Sputum",
                                                                         ifelse(grepl("BAL", path_bray_dist_long_within_sampleid_from_control$iso1), "BAL", NA)))

label <-  c("BAL","Nasal swab","Sputum")
names(label) <- c("BAL","nasal_swab","Sputum")

ggplot(path_bray_dist_long_within_sampleid_from_control, aes(y = dist, fill = treatment)) +
        geom_boxplot() +
        #scale_fill_manual(values = c(viridis(6)[2:6])) +
        scale_fill_manual(values = c("#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Sample pair distances") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "F") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type, labeller = labeller(sample_type = label))

PERMANOVA test results

Subject as fixed effect vs strata term

Subject as fixed effect

adonis (dist ~ sample_type + log10(Final_reads) + treated + subject)

With strata

adonis (dist ~ sample_type + log10(Final_reads) + treated, strata = subject)

Strata term was employed rather than fixed effect

bray_perm_uni %>% data.frame(check.names = F) %>% rownames_to_column("row.names") %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "treatment" ~ 'Treatment',
                                     row.names == "subject_id" ~ 'Subject',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html", caption = "Subject id as fixed effect") %>%
        kable_styling(full_width = 0, html_font = "serif")
Subject id as fixed effect
Df SumOfSqs R2 F Pr(>F)
Sample type 2 13.429 0.371 56.251 0.000
log10(Final reads) 1 1.111 0.031 9.310 0.000
Treatment 5 1.213 0.033 2.033 0.001
Subject 10 11.639 0.321 9.751 0.000
Residual 74 8.833 0.244 NA NA
Total 92 36.225 1.000 NA NA
bray_perm_uni_strata %>% data.frame(check.names = F) %>% rownames_to_column("row.names") %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "treatment" ~ 'Treatment',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html", caption = "Subject id as strata term") %>%
        kable_styling(full_width = 0, html_font = "serif")
Subject id as strata term
Df SumOfSqs R2 F Pr(>F)
Sample type 2 13.429 0.371 27.550 0.000
log10(Final reads) 1 1.111 0.031 4.560 0.000
Treatment 5 1.213 0.033 0.996 0.025
Residual 84 20.472 0.565 NA NA
Total 92 36.225 1.000 NA NA

Treatment significantly affected the beta-diversity.

Strata term is better representing our study aim.

What type of method affected the community at the most?

PERMANOVA on each treatment

dist ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp, strata = subject

QIAamp showed highest changes. But, it could be sample type specific.

bray_perm_strata %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 2 13.429 0.371 27.550 0.000
log10(Final reads) 1 1.111 0.031 4.560 0.000
lyPMA 1 0.140 0.004 0.574 0.521
Benzonase 1 0.092 0.003 0.379 0.663
Host zero 1 0.157 0.004 0.643 0.408
Molysis 1 0.200 0.006 0.821 0.167
QIAamp 1 0.624 0.017 2.561 0.003
Residual 84 20.472 0.565 NA NA
Total 92 36.225 1.000 NA NA

PERMANOVA with interaction term

dist ~ sample_type * treatment + log10(Final_reads), strata = subject

It was sample type specific. We need stratified analysis

bray_perm_inter %>% data.frame(check.names = F) %>% rownames_to_column("row.names") %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "treatment" ~ 'Treatment',
                                     row.names == "subject_id" ~ 'Subject',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "sample_type:treatment" ~ 'Sample type * treatment',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 2 13.429 0.371 27.577 0
Treatment 5 1.184 0.033 0.973 0
log10(Final reads) 1 1.140 0.031 4.683 0
Sample type * treatment 10 2.455 0.068 1.008 0
Residual 74 18.017 0.497 NA NA
Total 92 36.225 1.000 NA NA

Stratified (nasal swab)

lyPMA affected nasal swab samples.

bray_perm_ns %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.730 0.120 4.873 0.002
Benzonase 1 0.191 0.031 1.277 0.312
Host zero 1 0.171 0.028 1.143 0.374
Molysis 1 0.137 0.022 0.914 0.373
QIAamp 1 0.254 0.042 1.694 0.142
log10(Final reads) 1 0.428 0.070 2.861 0.031
Residual 28 4.192 0.687 NA NA
Total 34 6.103 1.000 NA NA

Stratified (BAL)

QIAamp affected BAL samples

bray_perm_bal %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.100 0.010 0.272 0.325
Benzonase 1 0.025 0.003 0.068 0.986
Host zero 1 0.086 0.009 0.235 0.518
Molysis 1 0.085 0.009 0.230 0.569
QIAamp 1 0.229 0.024 0.623 0.004
log10(Final reads) 1 1.482 0.152 4.028 0.021
Residual 21 7.726 0.794 NA NA
Total 27 9.734 1.000 NA NA

Stratified (spt)

Sputum was affected by Molysis and QIAamp.

bray_perm_spt %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.139 0.020 0.633 0.194
Benzonase 1 0.037 0.005 0.170 0.765
Host zero 1 0.171 0.025 0.777 0.135
Molysis 1 0.436 0.063 1.985 0.012
QIAamp 1 0.953 0.137 4.339 0.000
log10(Final reads) 1 0.172 0.025 0.783 0.357
Residual 23 5.052 0.726 NA NA
Total 29 6.960 1.000 NA NA

Results:

4.1. Effect of each treatment on beta-diveristy was sample type specific.

4.2. NS showed no significant change by QIAamp method

4.3. Sputum showed high change after Molysis and QIAamp. However, here, high change may be meaning higher (better) detection efficiencies. Therefore further analysis is required.

Intermediate results

matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c("No increase in final reads",
                                                     "No increase in final reads",
                                                     "No increase in final reads"),
                                           Benzonase = c("No decrease in host %",
                                                         "No decrease in host %",
                                                         "No decrease in host %"),
                                           `Host zero` = c(NA,
                                                           NA,
                                                           NA),
                                           Molysis = c("No decrease in host %",
                                                       "High cahnge of failure in library pep",
                                                       NA),
                                           QIAamp = c("No decrease in host %",
                                                      NA,
                                                      "No decrease in host %")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of issues of each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of issues of each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL No increase in final reads No decrease in host % NA No decrease in host % No decrease in host %
Nasal swab No increase in final reads No decrease in host % NA High cahnge of failure in library pep NA
Sputum No increase in final reads No decrease in host % NA NA No decrease in host %
matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c(NA,
                                                           "Beta changed",
                                                           "Shannon +"),
                                           Benzonase = c(NA,
                                                           NA,
                                                           "Richness + Shannon + InvSimp +"),
                                           `Host zero` = c(NA,
                                                           "Richness + Shannon + InvSimp + BPI -",
                                                           NA),
                                           Molysis = c(NA,
                                                           "Richness + Shannon + InvSimp + BPI -",
                                                           "Beta changed"),
                                           QIAamp = c("Beta changed",
                                                           NA,
                                                           "Beta  changed")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of community changes induced by each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of community changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA Beta changed
Nasal swab Beta changed NA Richness + Shannon + InvSimp + BPI - Richness + Shannon + InvSimp + BPI - NA
Sputum Shannon + Richness + Shannon + InvSimp + NA Beta changed Beta changed

Some methods were successful in increasing final reads and lowering host DNA%.

We have no idea weather some changes in diversities are due to deeper sequencing or contaminants

Further anlyses on individual taxa are required

A5. DA analysis for taxa, by sample type and treatment

Hypothesis: if a taxon is a contaminant induced by a treatment method, its DA analysis result should be associated with treatment covariate.

Both stratified and nonstratified were conducted.

Looked at other level groups as well - family and genus

Without interaction

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

With interaction

feature ~ log10(final reads) + sample type + treatment + sample type * treatment + (1|subject)

Stratified

feature ~ log10(final reads) + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

MaAsLin settings : log transform, total sum scaling normalization

Results

#DA analysis - MaAslin
sample_data(phyloseq_rel_nz)$log10.Final_reads <- log10(sample_data(phyloseq_rel_nz)$Final_reads)

#Running MaAslin for all sample without decontam
#for taxa differentially abundant by host depletion method, look to see which ones overlap with potential contaminant taxa

# Maaslin - # # y ~ log(final reads) + sample_type + treatment  -----------

#all samples

MaAslin - volcano plot

Without interaction

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

Most samples are differentially abundant by sample type

#Making significance table for figure
        # Define a function to make species names italicized
species_italic <- function(data) {
  names <- gsub("_", " ", rownames(data))
  names <- gsub("[]]|[[]", "", names)
  names <- gsub(" sp", " sp.", names)
  names <- gsub(" sp.", "* sp.", names)
  names <- gsub(" group", "* group.", names)
  names <- ifelse(grepl("[*]", names), paste("*", names, sep = ""), paste("*", names, "*", sep = ""))
  rownames(data) <- names
  data
}

# Make a significance table for each figure (top 20 taxa)
make_sig_table <- function(data) {
  sig_data <- spread(data[order(data$qval), c("feature", "metadata", "qval")], metadata, qval)
  sig_data$min <- apply(sig_data, 1, FUN = min)
  sig_data <- sig_data[order(sig_data$min),] %>% select("feature", "lypma", "benzonase", "host_zero", "molysis", "qiaamp") %>% .[1:20,]
  sig_data[["feature"]] <- ifelse(sig_data[["feature"]] == "X.Collinsella._massiliensis", "[Collinsella]_massiliensis", sig_data[["feature"]])
  sig_data_italic <- sig_data %>% rownames_to_column(var = "-") %>% column_to_rownames(var = "feature") %>% species_italic %>% select(-c("-"))
  sig_data_sig <- ifelse(sig_data_italic < 0.1, "*", NA) %>% data.frame %>% 
          rename(lyPMA = lypma,  Benzonase = benzonase, `Host zero` = host_zero, Molysis = molysis, QIAamp = qiaamp)
  return(list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig))
}
fit_data_bal <- make_sig_table(fit_data_bal)
fit_data_ns <- make_sig_table(fit_data_ns)
fit_data_spt <- make_sig_table(fit_data_spt)

spt_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Sputum")) %in% fit_data_spt$data$feature)
fit_data_spt$rel <- cbind(spt_sig %>% otu_table %>% t, spt_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>% 
        .[row.names(fit_data_spt$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

ns_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab")) %in% fit_data_ns$data$feature)
fit_data_ns$rel <- cbind(ns_sig %>% otu_table %>% t, ns_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>%
        .[row.names(fit_data_ns$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

bal_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "BAL"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "BAL")) %in% fit_data_bal$data$feature)
fit_data_bal$rel <- cbind(bal_sig %>% otu_table %>% t, bal_sig %>% sample_data) %>% group_by(treatment) %>% summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>% column_to_rownames(., "treatment") %>% t () %>% species_italic() %>% data.frame(check.names = F) %>% 
        .[row.names(fit_data_bal$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

#Volcano plot

ggplot(maaslin_all, aes(y = -log10(qval), x = coef, col = metadata)) +
        theme_classic(base_family = "serif") +
        labs(tag = "A") +
        geom_point(size = 2) +
        xlab("MaAslin coefficient") +
        ylab("-log<sub>10</sub>(*q*-value)") +
        geom_hline(yintercept = 1, col = "gray") +
        geom_vline(xintercept = 0, col = "gray") +
        geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
        theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
        scale_color_manual(values = c("#4daf4a",  "#984ea3", "#f781bf", "#377eb8", "#ff7f00", "#ffff33", "#a65628"),
                           breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero",  "molysis", "qiaamp"), 
                           labels = c("log10 (Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero",  "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        guides(col = guide_legend(title = "Covariates", title.position = "top", nrow = 2))

MaAslin - table

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

Some taxa were changed due to treatment

Stratified analysis is required.

Table of associations had significant (q < 0.1) result

maaslin_all %>% subset(., .$qval < 0.1) %>% arrange(., .$feature) %>% .$metadata %>% table
## .
##         benzonase         host_zero log10.Final_reads             lypma 
##                 8                 6                50                14 
##           molysis            qiaamp       sample_type 
##                 9                 2                66

MaAslin - can’t test global significance of a covariate with multi level.

(https://forum.biobakery.org/t/global-significance-test-for-multilevel-factor/3061)

Baloon plot - BAL

Mean relative abundances of top 20 taxa had low q-values.

No taxa changed after treatment

ggballoonplot(fit_data_bal$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top", axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              axis.text.y = ggtext::element_markdown())  +
        geom_text(data = fit_data_bal$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Baloon plot - nasal swabs

Mean relative abundances of top 20 taxa had low q-values.

Some taxa changed after treatment, but nothing was unique

ggballoonplot(fit_data_ns$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "B") +
        theme(panel.grid.major = element_line(colour = "grey"), axis.text.x = element_text(vjust = 0.5, hjust=0.5), axis.text.y = ggtext::element_markdown(), legend.position = "top")  +
        geom_text(data = fit_data_ns$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Baloon plot - Sputum

Mean relative abundances of top 20 taxa had low q-values.

Some taxa changed after treatment, but nothing was unique

ggballoonplot(fit_data_spt$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "C") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top", axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              axis.text.y = ggtext::element_markdown())  +
        geom_text(data = fit_data_spt$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Results

Some taxa were significantly changed after treatment. Among top 20, no taxa observed in only one treatment group. As their emergence were consistent across all treatment groups, they were considered as endogenus.

MaAslin with interaction

feature ~ log10(Final reads) + treatment + sample type + treatment * sample type (1|subject)

Some taxa were treaetment specific, after adjusting interaction of sample type * treatment

#Generating interaction term
sample_data(phyloseq_rel_nz)$sampletype_treatment <- paste(sample_data(phyloseq_rel_nz)$sample_type, sample_data(phyloseq_rel_nz)$treatment, sep = "*")
#interaction term - ggplot
ggplot(maaslin_interaction, aes(y = -log10(qval), x = coef, col = metadata)) +
         theme_classic(base_family = "serif") +
         #labs(tag = "A") +
         ggtitle("MaAslin with interaction term")+
         geom_point(size = 2) +
         xlab("MaAslin coefficient") +
         ylab("-log<sub>10</sub>(*q*-value)") +
         geom_hline(yintercept = 1, col = "gray") +
         geom_vline(xintercept = 0, col = "gray") +
         geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
         theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
         scale_color_manual(values = c("#e41a1c",  "#377eb8", "#4daf4a", "#984ea3")) +
         guides(col = guide_legend(title = "Fixed effects", title.position = "top", nrow = 1))

#Checking number of bugs differentially abundance with interaction term 
cat("Number of differentially abundant bugs by each metadata")
## Number of differentially abundant bugs by each metadata
maaslin_interaction %>% subset(., .$qval < 0.1) %>% .$metadata %>% table()
## .
##    log10.Final_reads          sample_type sampletype_treatment 
##                    6                   73                  172 
##            treatment 
##                   24

MaAsLin interaction analysis

Hypothesis: if a sample is contaminated by some treatment, a change of taxon is likely to be associated with one treatment method

No taxa increased only because of one treatment method

 cat("Some taxa were increased by each treatmment.\n But they are not contaminants, \nif they are present in most of the treatments")
## Some taxa were increased by each treatmment.
##  But they are not contaminants, 
## if they are present in most of the treatments
 maaslin_interaction %>% subset(., .$qval < 0.1 & .$metadata == "treatment") %>% .$feature %>% table %>% data.frame %>% arrange(-Freq) %>% rename(Feature = ".") %>% kbl(format = "html", caption = "Table of taxa differentially abundant by treatment") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of taxa differentially abundant by treatment
Feature Freq
Candida_albicans 5
Candida_dubliniensis 5
Candida_parapsilosis 5
Cupriavidus_sp 3
Sutterella_parvirubra 3
Gemella_haemolysans 2
Peptostreptococcus_stomatis 1
 cat("Most of taxa were found on most of treatments.")
## Most of taxa were found on most of treatments.
 cat("Some taxa were treatment specific, only to one group")
## Some taxa were treatment specific, only to one group
subset(maaslin_interaction, maaslin_interaction$feature %in%  (maaslin_interaction %>% subset(., .$qval < 0.1 & .$metadata == "treatment") %>%
         .$feature %>% table %>% data.frame %>% subset(., Freq == 1) %>% .$. %>% as.character())) %>% subset(., .$qval < 0.1) %>% select(c("feature", "metadata", "value", "coef", "qval")) %>% remove_rownames() %>% kbl(format = "html", caption = "Table of taxa specific to one treatment group") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of taxa specific to one treatment group
feature metadata value coef qval
Peptostreptococcus_stomatis treatment lyPMA -3.024443 0.0281414

No taxa was increased due to one treatmemnt.

MaAslin at Family level

Some family were treatment specific.

ggplot(maaslin_all, aes(y = -log10(qval), x = coef, col = metadata)) +
        theme_classic(base_family = "serif") +
        labs(tag = "A") +
        geom_point(size = 2) +
        xlab("MaAslin coefficient") +
        ylab("-log<sub>10</sub>(*q*-value)") +
        geom_hline(yintercept = 1, col = "gray") +
        geom_vline(xintercept = 0, col = "gray") +
        geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
        theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
        scale_color_manual(values = c("#4daf4a",  "#984ea3", "#e41a1c"),
                           breaks = c("log10.Final_reads", "sample_type", "treatment"), 
                           labels = c("log10 (Final reads)", "Sample type", "Treatment")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        guides(col = guide_legend(title = "Covariates", title.position = "top", nrow = 2))

MaAslin family - table

feature ~ log10(final reads) + treatment (categorical)

Table of associations had significant (q < 0.1) result

Molysis may induced contaminants (Streptococcaceae)

maaslin_all %>% subset(., .$qval < 0.1) %>% arrange(., .$feature) %>% .$metadata %>% table
## .
## log10.Final_reads       sample_type         treatment 
##                19                25                24
subset(maaslin_all, maaslin_all$feature %in%  (maaslin_all %>% subset(., .$qval < 0.1) %>%
         .$feature %>% table %>% data.frame %>% subset(., Freq == 1) %>% .$. %>% as.character())) %>% subset(., .$qval < 0.1) %>% select(c("feature", "metadata", "value", "coef", "qval")) %>% subset(.,.$metadata == "treatment") %>% 
        remove_rownames() %>% kbl(format = "html", caption = "Table of family specific to one treatment group") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of family specific to one treatment group
feature metadata value coef qval
Streptococcaceae treatment Molysis 2.092584 0.0486049

A5 Results:

5.1. Both non-stratified and stratified analysis showed that there were no potential contaminants at species level.

5.2. Molysis may inducted 1 potential contaminants (Streptococcaceae), at family level

5.3. After adding control data, MaAslin needs to be reanalyzed. Adding controls (mock communities) for each treatment group will show more statistically valid results in y ~ log(final reads) + sample_type + treatment, (re = subject_id))

A6. Decontam - stratified by treatment

input of DNA concentration: 16S qPCR data

https://github.com/benjjneb/decontam/issues/33

Ben Callahan: But in the more limited testing on qPCR data the method still seems to work, and other publications report strong patterns of inverse frequency of contaminants using qPCR data - which is the pattern the frequency method relies on.

Both stratified and nonstratified

Strategy:

2.4.1. run decontam for all samples (common contaminants, by extraction)

2.4.2. stratify decontam analysis per each treatment method (contaminants by depletion methods)

Results

decontam - all sample

Listeria floridensis could be a potential contaminant

# Decontam package --------------------------------------------------------

# common contaminants across all the treatment methods
#Decontam - were there any contaminants?#

sample_data(phyloseq$phyloseq_rel)$is.neg <- grepl("Neg. ext.", sample_data(phyloseq$phyloseq_rel)$sample_type)
phyloseq_rel_nz <- subset_samples(phyloseq$phyloseq_rel, S.obs != 0)

#With all sampels
dec_f_all <- isContaminant(phyloseq_rel_nz, method="frequency", conc="DNA_bac_well")
dec_p_all <- isContaminant(phyloseq_rel_nz, method="prevalence", neg="is.neg", threshold=0.5)
dec_c_all <- isContaminant(phyloseq_rel_nz, method="combined", neg="is.neg", conc = "DNA_bac_well")

cat("decontam frequency - all sample")
## decontam frequency - all sample
dec_f_all %>% subset(.,.$contaminant)
cat("decontam prevalence - all sample")
## decontam prevalence - all sample
dec_p_all %>% subset(.,.$contaminant)
cat("decontam combined - all sample")
## decontam combined - all sample
dec_c_all %>% subset(.,.$contaminant)

decontam - stratified

Stratified analysis showed no contaminants in NS and BAL

Sputum may have Corynebacterium pseudodiphtheriticum and Candida albicans as contaminants.

#Stratified by sample type

cat("decontam prevalence - BAL")
## decontam prevalence - BAL
subset_samples(phyloseq_rel_nz, sample_type %in% c("BAL", "Neg. ext.")) %>%
        isContaminant(., method="prevalence", neg = "is.neg", threshold = 0.5) %>% subset(.,.$contaminant)
cat("decontam prevalence - Nasal swab")
## decontam prevalence - Nasal swab
subset_samples(phyloseq_rel_nz, sample_type %in% c("Nasal swab", "Neg. ext.")) %>%
        isContaminant(., method="prevalence", neg = "is.neg", threshold = 0.5) %>% subset(.,.$contaminant)
cat("decontam prevalence - Sputum")
## decontam prevalence - Sputum
subset_samples(phyloseq_rel_nz, sample_type %in% c("Sputum", "Neg. ext.")) %>%
        isContaminant(., method="prevalence", neg = "is.neg", threshold = 0.5) %>% subset(.,.$contaminant)
cat("decontam frequency - BAL")
## decontam frequency - BAL
subset_samples(phyloseq_rel_nz, sample_type %in% c("BAL", "Neg. ext.")) %>%
        isContaminant(method="frequency", conc="DNA_bac_well") %>% subset(.,.$contaminant)
cat("decontam frequency - Nasal swab")
## decontam frequency - Nasal swab
subset_samples(phyloseq_rel_nz, sample_type %in% c("Nasal swab", "Neg. ext.")) %>%
        isContaminant(method="frequency", conc="DNA_bac_well") %>% subset(.,.$contaminant)
cat("decontam frequency - Sputum")
## decontam frequency - Sputum
subset_samples(phyloseq_rel_nz, sample_type %in% c("Sputum", "Neg. ext.")) %>%
        isContaminant(method="frequency", conc="DNA_bac_well") %>% subset(.,.$contaminant)
cat("decontam combined - BAL")
## decontam combined - BAL
subset_samples(phyloseq_rel_nz, sample_type %in% c("BAL", "Neg. ext.")) %>%
        isContaminant(method="combined", neg="is.neg", conc = "DNA_bac_well") %>% subset(.,.$contaminant)
cat("decontam combined - Nasal swab")
## decontam combined - Nasal swab
subset_samples(phyloseq_rel_nz, sample_type %in% c("Nasal swab", "Neg. ext.")) %>%
        isContaminant(method="combined", neg="is.neg", conc = "DNA_bac_well") %>% subset(.,.$contaminant)
cat("decontam combined - Sputum")
## decontam combined - Sputum
subset_samples(phyloseq_rel_nz, sample_type %in% c("Sputum", "Neg. ext.")) %>%
        isContaminant(method="combined", neg="is.neg", conc = "DNA_bac_well") %>% subset(.,.$contaminant)

A6 Results:

6.1. Listeria floridensis could be a potential contaminant

6.2. Else, BAL and NS are free from contaminants, and sputum may have Corynebacterium pseudodiphtheriticum and Candida albicans as contaminants.

Further analysis is required after adding data of controls.

A7. LM of function alpha diversity

sample_data <- sample_data(phyloseq$phyloseq_path_rpkm) %>% data.frame(check.names = F) %>% subset(., !is.nan(.$simpson))
phyloseq_rel_nz <- subset_samples(phyloseq$phyloseq_path_rpkm, S.obs != 0 & sample_type %in% c("BAL", "Nasal swab", "Sputum"))
sample_data(phyloseq_rel_nz)$log10.Final_reads <- log10(sample_data(phyloseq_rel_nz)$Final_reads)
sample_data(phyloseq_rel_nz)$sampletype_treatment <- paste(sample_data(phyloseq_rel_nz)$sample_type, sample_data(phyloseq_rel_nz)$treatment, sep = ":")

Figure - Alpha diversity

Alpha diversity of functional analysis reult showed similar pattern with taxa result.

Similar approach was employed.

f4a <-        ggplot(subset(sample_data(phyloseq$phyloseq_path_rpkm), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = S.obs)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Species richness") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "A") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))


f4b <-        ggplot(subset(sample_data(phyloseq$phyloseq_path_rpkm), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = data_shannon)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Shannon") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "B") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

f4c <-        ggplot(subset(sample_data(phyloseq$phyloseq_path_rpkm), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = data_invsimpson)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Inverse simpson") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "C") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

f4d <-        ggplot(subset(sample_data(phyloseq$phyloseq_path_rpkm), sample_data$sample_type %in% c("Sputum", "Nasal swab", "BAL")), aes(y = dbp)) +
        geom_boxplot(aes(fill = treatment), lwd = 0.2) +
        #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + # color using viridis
        scale_fill_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Berger-Parker index") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "D") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type) + 
        guides(fill = guide_legend(nrow = 1))

ggarrange(f4a, f4b, f4c, f4d, common.legend = T, align = "hv") # alpha diversity plots

Function richness

Alpha diversity chould be having changes due to treatment.

Both stratified and nonstratified analyses were conducted.

All samples:

S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

S.obs ~ sample_type + log10 (Final_reads) + (1|original_sample)

Function richness (all samples & interaction term) - ANOVA

Interaction term showed high p values. However, it could be due to even effect sample type * treatment. Interaction term will be tested.

sample_data <- sample_data(phyloseq$phyloseq_path_rpkm) %>% data.frame(check.names = F) %>% subset(., !is.nan(.$simpson))
lmer_sob <- lmer(S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)
lmer_sob %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 31544.55 15772.274 2 10.06740 10.123058 0.0038929
treatment 33352.07 6670.414 5 64.14678 4.281246 0.0020153
log10(Final_reads) 35804.04 35804.040 1 70.54314 22.979969 0.0000088
sample_type * treatment 29095.43 2909.543 10 63.26761 1.867421 0.0667651

Function richness (all samples & interaction term)

Effect of some treatment was neutralized by interactin term. Therefore, the association was sample_type specific.

Stratified analysis will be conducted.

lmer(S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -271.32533 63.94609 72.47916 -4.2430322 0.0000643
Nasal swab 66.71179 39.44578 32.64336 1.6912273 0.1003209
Sputum 142.80087 36.31835 41.80438 3.9319203 0.0003111
lyPMA 59.31768 31.18858 64.39962 1.9019036 0.0616583
Benzonase 96.59262 31.65626 66.97650 3.0512965 0.0032643
Host zero 129.84121 32.25492 67.36194 4.0254694 0.0001466
Molysis 150.32258 32.65321 67.58822 4.6036081 0.0000189
QIAamp 86.27368 32.71224 67.61999 2.6373516 0.0103578
log10(Final_reads) 52.77944 11.01007 70.54314 4.7937427 0.0000088
Nasal swab * lyPMA -23.81209 39.44179 64.87664 -0.6037273 0.5481301
Sputum * lyPMA -5.02253 39.40167 62.83470 -0.1274700 0.8989755
Nasal swab * Benzonase -80.25929 38.01990 65.40988 -2.1109811 0.0386006
Sputum * Benzonase -52.43718 38.65224 63.57018 -1.3566400 0.1796940
Nasal swab * Host zero -106.90271 36.81521 64.23327 -2.9037647 0.0050492
Sputum * Host zero -76.35866 38.92443 62.75019 -1.9617158 0.0542352
Nasal swab * Molysis -106.97855 38.54815 66.00383 -2.7751931 0.0071693
Sputum * Molysis -108.15918 39.34418 62.63915 -2.7490513 0.0077999
Nasal swab * QIAamp -75.62038 36.68939 64.06591 -2.0610968 0.0433576
Sputum * QIAamp -40.59820 38.59881 63.08680 -1.0517993 0.2969039

All terms were significant.

Function richness - stratified (NS)

Some treatment enabled discovering more functions in nasal swabs

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 68.35891 68.76371 28 0.9941131 0.3286817
lyPMA 13.88909 17.11543 28 0.8114955 0.4239262
Benzonase 20.95703 16.24183 28 1.2903126 0.2074965
Host zero 56.43052 18.37320 28 3.0713495 0.0047048
Molysis 51.56131 16.32822 28 3.1578029 0.0037880
QIAamp 54.41631 19.71859 28 2.7596444 0.0100871
log10(Final_reads) 12.25116 10.45340 28 1.1719787 0.2510816

Function richness (BAL)

Higher Final reads enables more discovery of functions.

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -640.83129 99.14027 19.00295 -6.4638848 0.0000034
lyPMA 14.46191 33.76511 18.36963 0.4283092 0.6734060
Benzonase 16.49562 35.65062 19.98741 0.4627021 0.6485757
Host zero 39.81066 36.93641 19.98154 1.0778162 0.2939563
Molysis 54.18626 37.78713 19.92262 1.4339874 0.1670859
QIAamp -10.73947 37.91284 19.91125 -0.2832674 0.7798961
log10(Final_reads) 124.09731 17.94627 17.34430 6.9149360 0.0000022

Function richness (sputum)

Sputum showed no changes. This may due to an enrichment of richness in control groups.

lmer(S.obs ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -127.58303 198.98655 21.38568 -0.6411641 0.5282282
lyPMA 54.38241 28.77622 19.88708 1.8898385 0.0734460
Benzonase 44.29211 36.29072 20.41549 1.2204804 0.2361930
Host zero 53.75292 61.00733 20.95720 0.8810895 0.3882627
Molysis 42.48496 71.13867 21.03201 0.5972132 0.5567405
QIAamp 45.90423 52.95334 20.86328 0.8668807 0.3958683
log10(Final_reads) 52.61786 33.96577 21.23460 1.5491436 0.1361234

Shannon function

All samples:

Shannon ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

Shannon ~ sample_type + log10 (Final_reads) + (1|original_sample)

ANOVA Shannon

p - value = 0.059 for the interaction term. Interaction term will be tested.

lmer_shannon <- lmer(data_shannon ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)

lmer_shannon %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.1823346 0.0911673 2 9.000626 6.3262277 0.0192428
treatment 0.2025353 0.0405071 5 64.449929 2.8108418 0.0233602
log10(Final_reads) 0.0012697 0.0012697 1 72.892849 0.0881036 0.7674461
sample_type * treatment 0.2764134 0.0276413 10 63.238098 1.9180715 0.0589214

Shannon (all samples & interaction term)

Nasal swab showed some sample type * treatment specific effect. Stratified analysis will be conducted.

#Shannon

lmer_shannon %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.5913112 0.1886495 72.61690 3.1344432 0.0024844
Nasal swab 0.3690633 0.1053813 38.98621 3.5021710 0.0011737
Sputum 0.2993038 0.0997287 55.53917 3.0011805 0.0040232
lyPMA 0.2036799 0.0942992 65.15176 2.1599339 0.0344627
Benzonase 0.0994949 0.0949856 68.98341 1.0474738 0.2985374
Host zero 0.2573740 0.0966602 69.52729 2.6626671 0.0096242
Molysis 0.3398170 0.0977799 69.83854 3.4753267 0.0008813
QIAamp 0.0878239 0.0979461 69.88172 0.8966555 0.3729814
log10(Final_reads) -0.0096706 0.0325805 72.89285 -0.2968225 0.7674461
Nasal swab * lyPMA -0.2226470 0.1190898 65.91555 -1.8695722 0.0659859
Sputum * lyPMA -0.0936705 0.1196459 62.61750 -0.7828973 0.4366384
Nasal swab * Benzonase -0.0512598 0.1146272 66.66595 -0.4471872 0.6561894
Sputum * Benzonase 0.0043312 0.1171417 63.75882 0.0369743 0.9706210
Nasal swab * Host zero -0.2539966 0.1113714 64.80534 -2.2806270 0.0258673
Sputum * Host zero -0.2117509 0.1182234 62.48276 -1.7911090 0.0781193
Nasal swab * Molysis -0.3660377 0.1160263 67.46898 -3.1547817 0.0023993
Sputum * Molysis -0.2802693 0.1195318 62.32165 -2.3447254 0.0222411
Nasal swab * QIAamp -0.1937200 0.1110482 64.48281 -1.7444684 0.0858422
Sputum * QIAamp -0.0639711 0.1171312 63.00030 -0.5461489 0.5868925

Shannon - stratified (NS)

QIAamp decreased alpha diversity of nasal swabs

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.9021024 0.1888432 28 4.7769924 0.0000510
lyPMA -0.0120429 0.0470035 28 -0.2562131 0.7996594
Benzonase 0.0493991 0.0446043 28 1.1074968 0.2775028
Host zero -0.0017956 0.0504576 28 -0.0355870 0.9718642
Molysis -0.0302035 0.0448416 28 -0.6735608 0.5061130
QIAamp -0.1176817 0.0541524 28 -2.1731574 0.0383841
log10(Final_reads) -0.0007741 0.0287078 28 -0.0269635 0.9786801

Shannon - stratified (BAL)

No changes found at BAL

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.3164981 0.4384386 19.99999 0.7218756 0.4787266
lyPMA 0.1566219 0.1595240 20.00000 0.9818075 0.3379233
Benzonase 0.0081445 0.1629779 20.00000 0.0499731 0.9606395
Host zero 0.1579336 0.1676636 20.00000 0.9419670 0.3574494
Molysis 0.2354040 0.1708188 20.00000 1.3780916 0.1833989
QIAamp -0.0173032 0.1712883 20.00000 -0.1010181 0.9205420
log10(Final_reads) 0.0484117 0.0775187 19.99999 0.6245166 0.5393481

Shannon - stratified (Spt)

Every covariate showed significant results against alpha diversity. Evenness of sputum affected by treatments.

lmer(data_shannon ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 1.9046747 0.3134676 20.38905 6.076146 0.0000056
lyPMA 0.2040040 0.0444997 19.33828 4.584393 0.0001943
Benzonase 0.2510413 0.0564000 19.52563 4.451090 0.0002582
Host zero 0.3368492 0.0953245 19.72386 3.533710 0.0021207
Molysis 0.4059052 0.1112411 19.75200 3.648878 0.0016224
QIAamp 0.2702564 0.0826606 19.68885 3.269471 0.0038945
log10(Final_reads) -0.1837105 0.0532273 19.82942 -3.451430 0.0025476

Simpson function

Inverse Simpson of all samples:

Inverse Simpson ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

Inverse Simpson ~ treatment + log10 (Final_reads) + (1|original_sample)

Inv Simp - ANOVA

p - value = 0.096 for the interaction term. Interaction term will be tested.

lmer_invsimpson <- lmer(data_invsimpson ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)
lmer_invsimpson %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.0267838 0.0133919 2 6.835601 0.3224571 0.7348127
treatment 0.1762910 0.0352582 5 65.324395 0.8489653 0.5201930
log10(Final_reads) 0.3696492 0.3696492 1 70.881743 8.9006066 0.0039077
sample_type * treatment 0.7115045 0.0711505 10 64.116101 1.7131977 0.0968588

Inv. Simpson (all samples & interaction term)

Sample type specific effect was observed. Stratified anlysis required.

#Simpson

lmer_invsimpson %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.5041672 0.3100373 72.98246 8.0769868 0.0000000
Nasal swab 0.2824300 0.1602334 39.89109 1.7626164 0.0856259
Sputum 0.3733177 0.1575829 65.69179 2.3690243 0.0207839
lyPMA 0.3355368 0.1587771 66.80622 2.1132571 0.0383176
Benzonase 0.0394673 0.1582723 71.20768 0.2493632 0.8037978
Host zero 0.2243663 0.1607711 71.74004 1.3955639 0.1671506
Molysis 0.3476547 0.1624552 72.01996 2.1400038 0.0357425
QIAamp 0.0491110 0.1627061 72.05712 0.3018387 0.7636445
log10(Final_reads) -0.1584764 0.0531196 70.88174 -2.9833884 0.0039077
Nasal swab * lyPMA -0.4433734 0.2001303 67.80520 -2.2154242 0.0300927
Sputum * lyPMA -0.3749646 0.2026651 63.40668 -1.8501686 0.0689497
Nasal swab * Benzonase -0.0224270 0.1922834 68.59286 -0.1166351 0.9074900
Sputum * Benzonase -0.1906563 0.1979198 64.84986 -0.9633004 0.3389752
Nasal swab * Host zero -0.3133525 0.1877081 66.20325 -1.6693600 0.0997686
Sputum * Host zero -0.4532172 0.2003127 63.25108 -2.2625485 0.0271066
Nasal swab * Molysis -0.5486404 0.1942378 69.41775 -2.8245812 0.0061753
Sputum * Molysis -0.5345178 0.2025954 63.08079 -2.6383508 0.0104810
Nasal swab * QIAamp -0.1871327 0.1873220 65.70759 -0.9989895 0.3214646
Sputum * QIAamp -0.3129724 0.1982404 63.88243 -1.5787516 0.1193324

Inverse Simpson - stratified (NS)

Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)

Nasal swab showed changes after Molysis treatment

lmer(data_invsimpson ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.4341612 0.3741168 28 6.5064209 0.0000005
lyPMA -0.0792832 0.0931184 28 -0.8514237 0.4017574
Benzonase 0.0112922 0.0883655 28 0.1277900 0.8992286
Host zero -0.1324711 0.0999615 28 -1.3252214 0.1958106
Molysis -0.2120235 0.0888356 28 -2.3866962 0.0239973
QIAamp -0.1955251 0.1072813 28 -1.8225468 0.0790685
log10(Final_reads) -0.1054975 0.0568729 28 -1.8549701 0.0741586

Inverse Simpson - stratified (BAL)

No changes found at BAL

lmer(data_invsimpson ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.7079552 0.7370091 18.45226 3.6742494 0.0016771
lyPMA 0.3560120 0.2545271 18.46585 1.3987194 0.1784686
Benzonase 0.0769610 0.2668231 19.99899 0.2884345 0.7759824
Host zero 0.2671909 0.2760392 19.90623 0.9679453 0.3446858
Molysis 0.3937560 0.2821570 19.78512 1.3955212 0.1783255
QIAamp 0.0956828 0.2830622 19.76444 0.3380276 0.7389065
log10(Final_reads) -0.1967495 0.1328208 16.24623 -1.4813153 0.1576569

Inverse Simpson - stratified (spt)

Changes associated with deeper sequencing with sputum

lmer(data_invsimpson ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 4.2511748 0.4286288 20.78282 9.9180809 0.0000000
lyPMA 0.0879014 0.0615090 19.54384 1.4290812 0.1687705
Benzonase 0.0482351 0.0777588 19.91984 0.6203172 0.5420814
Host zero 0.1656569 0.1310623 20.31362 1.2639552 0.2205584
Molysis 0.2823279 0.1528856 20.36901 1.8466618 0.0793776
QIAamp 0.0699277 0.1137064 20.24448 0.6149849 0.5454161
log10(Final_reads) -0.3942383 0.0730735 20.52069 -5.3950901 0.0000256

Burger-Parker index Function

BPI of all samples:

BPI ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

BPI stratified:

BPI ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

BPI -ANOVA

p - value = 0.072 for the interaction term. Interaction term will be tested.

lmer_dbp <- lmer(dbp ~ sample_type * treatment + log10 (Final_reads) + (1|subject_id), data = sample_data)

lmer_dbp %>% 
        anova() %>% 
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.0031047 0.0015524 2 5.797922 0.2268168 0.8038192
treatment 0.0257612 0.0051522 5 65.443949 0.7527933 0.5870575
log10(Final_reads) 0.0718488 0.0718488 1 69.374684 10.4978010 0.0018365
sample_type * treatment 0.1252121 0.0125212 10 64.288564 1.8294710 0.0729245

BPI - all samples & interaction term

Sample type specific effect was observed. Stratified anlysis required.

#BPI
lmer_dbp %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.3494454 0.1248399 72.85764 2.7991472 0.0065536
Nasal swab -0.0949470 0.0634800 38.59201 -1.4957002 0.1428654
Sputum -0.1489322 0.0631257 67.11730 -2.3592951 0.0212271
lyPMA -0.1354895 0.0643160 67.14843 -2.1066210 0.0388903
Benzonase -0.0126454 0.0639374 71.58151 -0.1977775 0.8437797
Host zero -0.0763098 0.0649149 72.08571 -1.1755356 0.2436485
Molysis -0.1501657 0.0655753 72.34056 -2.2899747 0.0249412
QIAamp -0.0134435 0.0656737 72.37364 -0.2047007 0.8383806
log10(Final_reads) 0.0690874 0.0213231 69.37468 3.2400310 0.0018365
Nasal swab * lyPMA 0.1885595 0.0810249 68.19355 2.3271804 0.0229322
Sputum * lyPMA 0.1784163 0.0822236 63.57164 2.1698910 0.0337594
Nasal swab * Benzonase 0.0146779 0.0778149 68.96328 0.1886253 0.8509405
Sputum * Benzonase 0.0964409 0.0802472 65.06581 1.2017975 0.2337980
Nasal swab * Host zero 0.1083404 0.0760586 66.47624 1.4244332 0.1589988
Sputum * Host zero 0.1771872 0.0812745 63.42192 2.1801082 0.0329681
Nasal swab * Molysis 0.2291696 0.0785680 69.77276 2.9168336 0.0047530
Sputum * Molysis 0.2317119 0.0822065 63.25919 2.8186574 0.0064296
Nasal swab * QIAamp 0.0571663 0.0759204 65.94314 0.7529771 0.4541435
Sputum * QIAamp 0.1292876 0.0804119 64.06419 1.6078168 0.1127930

BPI - stratified (NS)

Molysis changed BPI

lmer(dbp ~ treatment + log10 (Final_reads) + (1|subject_id), data = subset(sample_data, sample_data$sample_type == "Nasal swab")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.4017357 0.1498449 28 2.6810102 0.0121638
lyPMA 0.0412304 0.0372967 28 1.1054691 0.2783650
Benzonase 0.0046045 0.0353930 28 0.1300974 0.8974195
Host zero 0.0504577 0.0400376 28 1.2602602 0.2179777
Molysis 0.0834844 0.0355813 28 2.3463015 0.0262687
QIAamp 0.0677258 0.0429693 28 1.5761426 0.1262256
log10(Final_reads) 0.0468284 0.0227793 28 2.0557472 0.0492388

BPI - stratified (BAL)

No changes

lmer(dbp ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "BAL")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.2445672 0.2956552 18.00596 0.8272041 0.4189496
lyPMA -0.1456413 0.1032851 18.57372 -1.4100912 0.1750396
Benzonase -0.0313798 0.1076447 19.98145 -0.2915125 0.7736642
Host zero -0.0977753 0.1112261 19.81800 -0.8790675 0.3898992
Molysis -0.1733099 0.1136103 19.64832 -1.5254772 0.1430763
QIAamp -0.0368287 0.1139635 19.62045 -0.3231626 0.7499909
log10(Final_reads) 0.0886954 0.0530745 15.42411 1.6711477 0.1148535

BPI - stratified (spt)

Changes only associated with higher final reads

lmer(dbp ~ treatment + log10 (Final_reads) + (1|original_sample), data = subset(sample_data, sample_data$sample_type == "Sputum")) %>% 
        summary() %>%
        .$coefficients %>%
        data.frame(check.names = F) %>% 
        mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        rownames_to_column(var = "x") %>% mutate(x = gsub("treatment|sample_type", "", x)) %>% mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -0.2701952 0.1781304 20.96674 -1.5168392 0.1442409
lyPMA -0.0007038 0.0256334 19.62396 -0.0274558 0.9783735
Benzonase 0.0154609 0.0323782 20.05945 0.4775100 0.6381608
Host zero -0.0343045 0.0545240 20.51301 -0.6291638 0.5361864
Molysis -0.0792267 0.0635945 20.57650 -1.2458118 0.2268328
QIAamp 0.0014680 0.0473113 20.43364 0.0310278 0.9755484
log10(Final_reads) 0.1498736 0.0303847 20.74986 4.9325289 0.0000726

A8. Function beta diversity

Permanova (Taxa dist ~ log10(final reads) + sample_type + treatment + sample_type * treatment + subject_id) –> both stratified and nonstratified

Beta diversity figure

PCoA based on Bray-Curtis dissimilarities

bray_perm_uni_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + treatment,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_inter <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type * treatment + log10(Final_reads),
                                  data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F), 
                                  strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_ns <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                               data = subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab") %>%
                                       sample_data %>% data.frame(check.names = F),
                               strata = subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab") %>% 
                                       sample_data %>% data.frame(check.names = F) %>% .$subject_id, permutations = 10000)

bray_perm_bal  <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "BAL"), method="bray") ~  lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                 data = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>% sample_data %>% data.frame(check.names = F),
                                 strata = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>%
                                         sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_spt <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz, sample_type == "Sputum") %>% sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
                                %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)


ordinate(phyloseq_rel_nz,  method = "PCoA", distance = "bray") %>%
        plot_ordination(phyloseq_rel_nz, ., col = "treatment", shape = "sample_type" ) + 
        #scale_color_viridis(discrete = 6, name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) +
        scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("Control","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_shape(name = "Sample type", labels = c("BAL", "Nasal swab", "Sputum")) +
        geom_point(size = 3) +
        theme_classic (base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, 'cm'), legend.key.height = unit(0.4, "cm")) + #legend.position = c(0.9, 0.4)
        labs(tag = "E")

Beta diversity boxplot (Function)

Distances between samples within each subject. Mean distance between control <-> treatment for each subject

#distances of betadiversity - boxplots
bray_dist_long <- distance(phyloseq_rel_nz, method="bray") %>% as.matrix() %>% melt_dist() #making long data of distance matrices
#Adding sample type and treatment name. 
#this can be also done by merging metadata into the `bray_dist_long`
names <- data.frame(str_split_fixed(bray_dist_long$iso1, "_", 3))
names2 <- data.frame(str_split_fixed(bray_dist_long$iso2, "_", 3))
bray_dist_long$sample_id_1 <- paste(names$X1, names$X2, sep = "_")
bray_dist_long$method_1 <- ifelse(grepl("control", bray_dist_long$iso1),"control", 
                                  ifelse(grepl("lyPMA", bray_dist_long$iso1),"lypma", 
                                         ifelse(grepl("benzonase", bray_dist_long$iso1),"benzonase", 
                                                ifelse(grepl("host", bray_dist_long$iso1),"host_zero", 
                                                       ifelse(grepl("qia", bray_dist_long$iso1),"qiaamp", 
                                                              ifelse(grepl("moly", bray_dist_long$iso1),"molysis", 
                                                                     NA))))))
#Adding data for iso 2 also should be done
bray_dist_long$sample_id_2 <- paste(names2$X1, names2$X2, sep = "_")
bray_dist_long$method_2 <-ifelse(grepl("control", bray_dist_long$iso2),"control", 
                                 ifelse(grepl("lyPMA", bray_dist_long$iso2),"lypma", 
                                        ifelse(grepl("benzonase", bray_dist_long$iso2),"benzonase", 
                                               ifelse(grepl("host", bray_dist_long$iso2),"host_zero", 
                                                      ifelse(grepl("qia", bray_dist_long$iso2),"qiaamp", 
                                                             ifelse(grepl("moly", bray_dist_long$iso2),"molysis", 
                                                                    NA))))))
#subsetting distances of my interest
path_bray_dist_long_within_sampleid <- subset(bray_dist_long, bray_dist_long$sample_id_1 == bray_dist_long$sample_id_2)
path_bray_dist_long_within_sampleid_from_control <- subset(path_bray_dist_long_within_sampleid, path_bray_dist_long_within_sampleid$method_1 == "control" | path_bray_dist_long_within_sampleid$method_2 == "control" )
path_bray_dist_long_within_sampleid_from_control$treatment <- path_bray_dist_long_within_sampleid_from_control$method_1
path_bray_dist_long_within_sampleid_from_control$treatment <- ifelse(path_bray_dist_long_within_sampleid_from_control$treatment == "control", path_bray_dist_long_within_sampleid_from_control$method_2, path_bray_dist_long_within_sampleid_from_control$treatment)
path_bray_dist_long_within_sampleid_from_control$sample_type <- ifelse(grepl("NS", path_bray_dist_long_within_sampleid_from_control$iso1), "nasal_swab",
                                                                  ifelse(grepl("CFB", path_bray_dist_long_within_sampleid_from_control$iso1), "Sputum",
                                                                         ifelse(grepl("BAL", path_bray_dist_long_within_sampleid_from_control$iso1), "BAL", NA)))

label <-  c("BAL","Nasal swab","Sputum")
names(label) <- c("BAL","nasal_swab","Sputum")

ggplot(path_bray_dist_long_within_sampleid_from_control, aes(y = dist, fill = treatment)) +
        geom_boxplot() +
        #scale_fill_manual(values = c(viridis(6)[2:6])) +
        scale_fill_manual(values = c("#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33"), name = "Treatment", labels = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAaamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Sample pair distances") +
        theme_classic (base_size = 12, base_family = "serif") + 
        labs(tag = "F") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type, labeller = labeller(sample_type = label))

Function PERMANOVA test results

Treatment as categorized group

dist ~ sample_type + log10(Final_reads) + treatment, strata = subject

No significant changes were observed.

bray_perm_uni_strata %>% data.frame(check.names = F) %>% rownames_to_column("row.names") %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "treatment" ~ 'Treatment',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html", caption = "Subject id as strata term") %>%
        kable_styling(full_width = 0, html_font = "serif")
Subject id as strata term
Df SumOfSqs R2 F Pr(>F)
Sample type 2 1.557 0.344 35.378 0.000
log10(Final reads) 1 1.024 0.226 46.525 0.000
Treatment 5 0.124 0.027 1.124 0.525
Residual 83 1.826 0.403 NA NA
Total 91 4.530 1.000 NA NA

Function PERMAONVA (detailed treatment)

dist ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp, strata = subject

No significant changes were observed.

bray_perm_strata %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 2 1.557 0.344 35.378 0.000
log10(Final reads) 1 1.024 0.226 46.525 0.000
lyPMA 1 0.019 0.004 0.847 0.426
Benzonase 1 0.008 0.002 0.382 0.521
Host zero 1 0.008 0.002 0.372 0.565
Molysis 1 0.087 0.019 3.973 0.053
QIAamp 1 0.001 0.000 0.044 0.962
Residual 83 1.826 0.403 NA NA
Total 91 4.530 1.000 NA NA

QIAamp showed highest changes. But, it could be sample type specific.

Function interaction term

dist ~ sample_type * treatment + log10(Final_reads), strata = subject

Some changes were treatment induced?

We don’t have to run stratified anlysis

bray_perm_inter %>% data.frame(check.names = F) %>% rownames_to_column("row.names") %>% 
        mutate(row.names = case_when(row.names == "sample_type" ~ 'Sample type',
                                     row.names == "treatment" ~ 'Treatment',
                                     row.names == "subject_id" ~ 'Subject',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "sample_type:treatment" ~ 'Sample type * treatment',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 2 1.557 0.344 38.648 0.000
Treatment 5 0.684 0.151 6.791 0.000
log10(Final reads) 1 0.463 0.102 23.010 0.001
Sample type * treatment 10 0.356 0.079 1.767 0.128
Residual 73 1.470 0.325 NA NA
Total 91 4.530 1.000 NA NA

Stratified (NS)

Stratified analysis not matching with boxplot results

bray_perm_ns %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.011 0.042 2.193 0.145
Benzonase 1 0.011 0.040 2.076 0.147
Host zero 1 0.009 0.033 1.728 0.189
Molysis 1 0.014 0.052 2.746 0.088
QIAamp 1 0.054 0.197 10.324 0.002
log10(Final reads) 1 0.028 0.103 5.405 0.023
Residual 28 0.147 0.534 NA NA
Total 34 0.275 1.000 NA NA

Stratified (BAL)

Stratified analysis not matching with boxplot results

bray_perm_bal %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.040 0.020 0.800 0.344
Benzonase 1 0.115 0.058 2.279 0.091
Host zero 1 0.027 0.014 0.543 0.403
Molysis 1 0.180 0.091 3.569 0.046
QIAamp 1 0.001 0.000 0.018 0.938
log10(Final reads) 1 0.603 0.306 11.980 0.082
Residual 20 1.007 0.510 NA NA
Total 26 1.973 1.000 NA NA

Stratified (spt)

Stratified analysis not matching with boxplot results

bray_perm_spt %>% data.frame(check.names = F) %>% rownames_to_column('row.names') %>% 
        mutate(row.names = case_when(row.names == "lypma" ~ 'lyPMA',
                                     row.names == "benzonase" ~ 'Benzonase',
                                     row.names == "host_zero" ~ 'Host zero',
                                     row.names == "molysis" ~ 'Molysis',
                                     row.names == "qiaamp" ~ 'QIAamp',
                                     row.names == "subject_id" ~ 'Subject id',
                                     row.names == "log10(Final_reads)" ~ 'log10(Final reads)',
                                     row.names == "Residual" ~ 'Residual',
                                     row.names == "Total" ~ 'Total')) %>% column_to_rownames('row.names') %>% 
        round(3) %>% mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>% 
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.017 0.023 3.773 0.062
Benzonase 1 0.002 0.003 0.423 0.580
Host zero 1 0.064 0.089 14.506 0.004
Molysis 1 0.139 0.191 31.291 0.000
QIAamp 1 0.370 0.509 83.348 0.000
log10(Final reads) 1 0.032 0.045 7.303 0.010
Residual 23 0.102 0.141 NA NA
Total 29 0.726 1.000 NA NA

Results:

A9. DA analysis for taxa, by sample type and treatment

Both stratified and nonstratified were conducted.

MaAsLin condition:

Transformation: log transform

Normalization: None - as functional hits were normalized as RPKM already.

https://forum.biobakery.org/t/maaslin-with-shortbred-results-and-panphlan/3102

Results

#DA analysis - MaAslin
sample_data(phyloseq_rel_nz)$log10.Final_reads <- log10(sample_data(phyloseq_rel_nz)$Final_reads)

#Running MaAslin for all sample without decontam
#for taxa differentially abundant by host depletion method, look to see which ones overlap with potential contaminant taxa

# Maaslin - # # y ~ log(final reads) + sample_type + treatment  -----------

#all samples

MaAslin without interaction - volcano plot

Again, most of DA functions were sample type specific

#Making significance table for figure
        # Define a function to make species names italicized
# Make a significance table for each figure (top 20 taxa)

make_sig_table <- function(data) {
  sig_data <- spread(data[order(data$qval), c("feature", "metadata", "qval")], metadata, qval)
  sig_data$min <- apply(sig_data, 1, FUN = min)
  sig_data <- sig_data[order(sig_data$min),] %>% select("feature", "lypma", "benzonase", "host_zero", "molysis", "qiaamp") %>% .[1:20,]
  sig_data[["feature"]] <- ifelse(sig_data[["feature"]] == "X.Collinsella._massiliensis", "[Collinsella]_massiliensis", sig_data[["feature"]])
  sig_data_italic <- sig_data %>% rownames_to_column(var = "-") %>% column_to_rownames(var = "feature") %>% select(-c("-"))
  sig_data_sig <- ifelse(sig_data_italic < 0.1, "*", NA) %>% data.frame %>% 
          rename(lyPMA = lypma,  Benzonase = benzonase, `Host zero` = host_zero, Molysis = molysis, QIAamp = qiaamp)
  return(list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig))
}

fit_data_bal <- make_sig_table(fit_data_bal)
fit_data_ns <- make_sig_table(fit_data_ns)
fit_data_spt <- make_sig_table(fit_data_spt)

spt_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Sputum")) %>%
                               gsub("-", ".", .) %in% fit_data_spt$data$feature)
fit_data_spt$rel <- cbind(spt_sig %>% otu_table %>% t, spt_sig %>% sample_data) %>% 
        group_by(treatment) %>%
        summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>%
        column_to_rownames(., "treatment") %>% t () %>% data.frame(check.names = F) %>% 
        rownames_to_column("row") %>% mutate(row = gsub("-", ".", row)) %>% column_to_rownames("row") %>%
        .[row.names(fit_data_spt$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

ns_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Nasal swab"))%>%
                               gsub("-", ".", .) %in% fit_data_ns$data$feature)

fit_data_ns$rel <- cbind(ns_sig %>% otu_table %>% t, ns_sig %>% sample_data) %>% 
        group_by(treatment) %>%
        summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>%
        column_to_rownames(., "treatment") %>% t () %>% data.frame(check.names = F) %>% 
        rownames_to_column("row") %>% mutate(row = gsub("-", ".", row)) %>% column_to_rownames("row") %>%
        .[row.names(fit_data_ns$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

bal_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "BAL"),
                                       taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "BAL")) %>%
                               gsub("-", ".", .) %in% fit_data_bal$data$feature)
fit_data_bal$rel <- cbind(bal_sig %>% otu_table %>% t, bal_sig %>% sample_data) %>% 
        group_by(treatment) %>%
        summarise_if(is.numeric, mean, na.rm = TRUE) %>% .[, 1:21] %>%
        column_to_rownames(., "treatment") %>% t () %>% data.frame(check.names = F) %>% 
        rownames_to_column("row") %>% mutate(row = gsub("-", ".", row)) %>% column_to_rownames("row") %>%
        .[row.names(fit_data_bal$data_italic),] %>%  mutate_all(~na_if(., 0)) %>% rownames_to_column("feature")

#Volcano plot

ggplot(maaslin_all, aes(y = -log10(qval), x = coef, col = metadata)) +
        theme_classic(base_family = "serif") +
        labs(tag = "A") +
        geom_point(size = 2) +
        xlab("MaAslin coefficient") +
        ylab("-log<sub>10</sub>(*q*-value)") +
        geom_hline(yintercept = 1, col = "gray") +
        geom_vline(xintercept = 0, col = "gray") +
        geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
        theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
        scale_color_manual(values = c("#4daf4a",  "#984ea3", "#f781bf", "#377eb8", "#ff7f00", "#ffff33", "#a65628"),
                           breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero",  "molysis", "qiaamp"), 
                           labels = c("log10 (Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero",  "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        guides(col = guide_legend(title = "Covariates", title.position = "top", nrow = 2))

Most of the DA function were sample type dependent.

MaAsLin table (function)

Large number of functions were differentially aubundant.

maaslin_all %>% subset(., .$qval < 0.1) %>% .$metadata %>% table
## .
##         benzonase         host_zero log10.Final_reads             lypma 
##               100               125               162                90 
##           molysis            qiaamp       sample_type 
##               185                66               288

Stratified analysis is required.

Baloon plot - nasal swabs

Similarly, few functions were newly discovered

ggballoonplot(fit_data_ns$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "B") +
        theme(panel.grid.major = element_line(colour = "grey"), axis.text.x = element_text(vjust = 0.5, hjust=0.5), axis.text.y = ggtext::element_markdown())  +
        geom_text(data = fit_data_ns$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Baloon plot - Sputum

Similarly, few functions were newly discovered

ggballoonplot(fit_data_spt$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "C") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "none", axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              axis.text.y = ggtext::element_markdown())  +
        geom_text(data = fit_data_spt$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Baloon plot - BAL

Some functions were newly discovered.

ggballoonplot(fit_data_bal$rel %>% gather(treatment, value, Control:QIAamp, factor_key=TRUE), fill = "value", y = "feature", x= "treatment") +
        theme_classic(base_family = "serif") +
        scale_fill_viridis() +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "none", axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              axis.text.y = ggtext::element_markdown())  +
        geom_text(data = fit_data_bal$data_sig %>%
                          rownames_to_column("feature") %>%
                          gather(treatment, significance, lyPMA:QIAamp, factor_key=TRUE) %>%
                          subset(., !is.na(.$significance)),
                  aes(y = feature, x = treatment, label = significance, col = significance), hjust = -2, vjust = 0.8, size = 5) +
        guides(col = guide_legend(nrow = 3, override.aes = aes(label = "*", size = 10, color = "red"), title="Significance", title.position = "top", order = 3), size = guide_legend(title = "Relative abundance", title.position = "top", order = 1, nrow = 2), fill = F) + 
        scale_x_discrete(labels=c("control" = "Control", "lypma" = "lyPMA", "benzonase" = "Benzonase", "host_zero" = "Host-zero", "molysis" = "Molysis", "qiaamp" = "QIAamp")) +
        scale_color_manual(values = c("red"), labels = c(expression(paste(italic("q"), "-value < 0.1", sep = ""))))

Results After adding control data, MaAslin needs to be reanalyzed. Adding controls (mock communities) for each treatment group will show more statistically valid results in y ~ log(final reads) + sample_type + treatment, (re = subject_id))

MaAslin with interaction

Figure and table of differentially abundant function.

#Generating interaction term
sample_data(phyloseq_rel_nz)$sampletype_treatment <- paste(sample_data(phyloseq_rel_nz)$sample_type, sample_data(phyloseq_rel_nz)$treatment, sep = "*")
#interaction term - ggplot
ggplot(maaslin_interaction, aes(y = -log10(qval), x = coef, col = metadata)) +
         theme_classic(base_family = "serif") +
         #labs(tag = "A") +
         ggtitle("MaAslin with interaction term")+
         geom_point(size = 2) +
         xlab("MaAslin coefficient") +
         ylab("-log<sub>10</sub>(*q*-value)") +
         geom_hline(yintercept = 1, col = "gray") +
         geom_vline(xintercept = 0, col = "gray") +
         geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
         theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
         scale_color_manual(values = c("#e41a1c",  "#377eb8", "#4daf4a", "#984ea3")) +
         guides(col = guide_legend(title = "Fixed effects", title.position = "top", nrow = 1))

#Checking number of bugs differentially abundance with interaction term 
cat("Number of differentially abundant bugs by each metadata")
## Number of differentially abundant bugs by each metadata
maaslin_interaction %>% subset(., .$qval < 0.1) %>% .$metadata %>% table()
## .
##    log10.Final_reads          sample_type sampletype_treatment 
##                   74                  262                  513 
##            treatment 
##                  189

MaAsLin interaction analysis

 cat("Some taxa were increased by each treatmment.\n But they are not contaminants, \nif they are present in most of the treatments")
## Some taxa were increased by each treatmment.
##  But they are not contaminants, 
## if they are present in most of the treatments
 maaslin_interaction %>% subset(., .$qval < 0.1 & .$metadata == "treatment") %>% .$feature %>% table %>% data.frame %>% arrange(-Freq) %>% rename(Feature = ".") %>% kbl(format = "html", caption = "Table of taxa differentially abundant by treatment") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of taxa differentially abundant by treatment
Feature Freq
PWY.7323 5
PPGPPMET.PWY 4
PWY.6519 4
PWY.7204 4
PWY0.1479 4
PWY0.1586 4
COLANSYN.PWY 3
FAO.PWY 3
GLYOXYLATE.BYPASS 3
OANTIGEN.PWY 3
P105.PWY 3
P4.PWY 3
PWY.1269 3
PWY.5136 3
PWY.5675 3
PWY.5690 3
PWY.5747 3
PWY.5855 3
PWY.5856 3
PWY.5857 3
PWY.6318 3
PWY.6708 3
PWY.6969 3
PWY.7663 3
PWY0.781 3
TCA.GLYOX.BYPASS 3
TYRFUMCAT.PWY 3
FASYN.ELONG.PWY 2
FERMENTATION.PWY 2
GLUCONEO.PWY 2
GLYCOGENSYNTH.PWY 2
GLYCOLYSIS.E.D 2
GLYCOLYSIS.TCA.GLYOX.BYPASS 2
HOMOSER.METSYN.PWY 2
MET.SAM.PWY 2
METSYN.PWY 2
NAGLIPASYN.PWY 2
PWY.241 2
PWY.5083 2
PWY.5347 2
PWY.5676 2
PWY.5695 2
PWY.5913 2
PWY.5973 2
PWY.6282 2
PWY.6595 2
PWY.6606 2
PWY.6608 2
PWY.6703 2
PWY.7282 2
PWY.7664 2
PWY0.1261 2
PWY0.845 2
PWY4FS.7 2
PWY4FS.8 2
PWY66.389 2
PYRIDOXSYN.PWY 2
ASPASN.PWY 1
BIOTIN.BIOSYNTHESIS.PWY 1
CALVIN.PWY 1
CITRULBIO.PWY 1
GLYCOL.GLYOXDEG.PWY 1
PENTOSE.P.PWY 1
PWY.1042 1
PWY.4984 1
PWY.5100 1
PWY.5104 1
PWY.5154 1
PWY.5173 1
PWY.5189 1
PWY.5265 1
PWY.5345 1
PWY.561 1
PWY.5667 1
PWY.5871 1
PWY.5873 1
PWY.5989 1
PWY.6126 1
PWY.6353 1
PWY.6471 1
PWY.6545 1
PWY.6700 1
PWY.6859 1
PWY.6936 1
PWY.7198 1
PWY.7229 1
PWY.7237 1
PWY.7254 1
PWY.841 1
PWY0.1319 1
PWY0.862 1
PWY1G.0 1
PWY3O.19 1
PWY66.399 1
PWY66.400 1
PWY66.409 1
PWYG.321 1
SALVADEHYPOX.PWY 1
 cat("Most of taxa were found on most of treatments.")
## Most of taxa were found on most of treatments.
 cat("Some taxa were treatment specific, only to one group")
## Some taxa were treatment specific, only to one group
subset(maaslin_interaction, maaslin_interaction$feature %in%  (maaslin_interaction %>% subset(., .$qval < 0.1 & .$metadata == "treatment") %>%
         .$feature %>% table %>% data.frame %>% subset(., Freq == 1) %>% .$. %>% as.character())) %>% subset(., .$qval < 0.1) %>% select(c("feature", "metadata", "value", "coef", "qval")) %>% subset(., .$metadata == "treatment") %>%
        remove_rownames() %>% kbl(format = "html", caption = "Table of taxa specific to one treatment group") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of taxa specific to one treatment group
feature metadata value coef qval
PWY66.399 treatment lyPMA -2.854232 0.0028504
PWY.6700 treatment lyPMA 4.327262 0.0036901
ASPASN.PWY treatment lyPMA 4.401548 0.0043963
PWY.4984 treatment lyPMA 4.156375 0.0048323
PWY.5104 treatment lyPMA 3.954702 0.0064690
PWY66.400 treatment lyPMA -2.253399 0.0098358
PWY.5173 treatment lyPMA 3.918380 0.0103995
PWY.7229 treatment lyPMA -2.166633 0.0111767
PWY.7198 treatment lyPMA -1.914683 0.0118071
PWY.7237 treatment QIAamp 3.951831 0.0140510
PWY.5989 treatment Molysis 3.862064 0.0162354
PWY.6126 treatment lyPMA -2.066399 0.0163405
PWY.841 treatment lyPMA -2.134628 0.0192771
PWY.5154 treatment lyPMA 3.908505 0.0206660
PWY.6545 treatment lyPMA -3.876545 0.0236477
PENTOSE.P.PWY treatment lyPMA -2.504045 0.0237785
PWY.5100 treatment lyPMA 2.926325 0.0239820
PWY.5871 treatment QIAamp 2.286020 0.0262192
PWY.5873 treatment QIAamp 2.286020 0.0262192
PWY0.862 treatment Molysis 3.858948 0.0360961
PWY.561 treatment Host zero 3.679261 0.0364917
PWY.6859 treatment lyPMA -2.365648 0.0434897
CALVIN.PWY treatment lyPMA 2.022717 0.0485584
PWY0.1319 treatment QIAamp -1.642528 0.0486151
PWY.5667 treatment QIAamp -1.638861 0.0489790
PWY.5345 treatment Molysis 2.474248 0.0501101
CITRULBIO.PWY treatment lyPMA 2.924422 0.0501892
PWYG.321 treatment Molysis 3.590886 0.0574639
PWY.6936 treatment QIAamp -1.478388 0.0591830
PWY.7254 treatment Molysis 3.637160 0.0640160
PWY.1042 treatment lyPMA -1.599141 0.0657547
PWY.6353 treatment lyPMA 2.903217 0.0658259
BIOTIN.BIOSYNTHESIS.PWY treatment Host zero 3.114788 0.0691854
PWY3O.19 treatment QIAamp 2.074899 0.0738120
PWY.5189 treatment lyPMA -1.591600 0.0759409
SALVADEHYPOX.PWY treatment lyPMA 2.710844 0.0771021
GLYCOL.GLYOXDEG.PWY treatment Benzonase 1.709435 0.0777447
PWY.6471 treatment Molysis 2.917857 0.0784918
PWY66.409 treatment lyPMA -1.869095 0.0789112
PWY.5265 treatment Molysis 2.679060 0.0908175
PWY1G.0 treatment QIAamp -3.428476 0.0934398

Final results summary

Sequencing results

matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c("No increase in final reads",
                                                     "No increase in final reads",
                                                     "No increase in final reads"),
                                           Benzonase = c("No decrease in host %",
                                                         "No decrease in host %",
                                                         "No decrease in host %"),
                                           `Host zero` = c(NA,
                                                           NA,
                                                           NA),
                                           Molysis = c("No decrease in host %",
                                                       "High cahnge of failure in library pep",
                                                       NA),
                                           QIAamp = c("No decrease in host %",
                                                      NA,
                                                      "No decrease in host %")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of issues of each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of issues of each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL No increase in final reads No decrease in host % NA No decrease in host % No decrease in host %
Nasal swab No increase in final reads No decrease in host % NA High cahnge of failure in library pep NA
Sputum No increase in final reads No decrease in host % NA NA No decrease in host %

Diversity changes (taxa)

matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c(NA,
                                                           "Beta changed",
                                                           "Shannon +"),
                                           Benzonase = c(NA,
                                                           NA,
                                                           "Richness + Shannon + InvSimp +"),
                                           `Host zero` = c(NA,
                                                           "Richness + Shannon + InvSimp + BPI -",
                                                           NA),
                                           Molysis = c(NA,
                                                           "Richness + Shannon + InvSimp + BPI -",
                                                           "Beta changed"),
                                           QIAamp = c("Beta changed",
                                                           NA,
                                                           "Beta  changed")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of community changes induced by each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of community changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA Beta changed
Nasal swab Beta changed NA Richness + Shannon + InvSimp + BPI - Richness + Shannon + InvSimp + BPI - NA
Sputum Shannon + Richness + Shannon + InvSimp + NA Beta changed Beta changed

Diversity changes (function)

matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c(NA,
                                                           NA,
                                                           "Shannon +"),
                                           Benzonase = c(NA,
                                                           NA,
                                                           "Shannon +"),
                                           `Host zero` = c(NA,
                                                           "Richness +",
                                                           "Shannon +"),
                                           Molysis = c(NA,
                                                           "Richness + InvSimp + BPI +",
                                                           "Shannon +"),
                                           QIAamp = c(NA,
                                                           "Richness + Shannon +",
                                                           "Shannon +")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of functional diversity changes induced by each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of functional diversity changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA NA
Nasal swab NA NA Richness + Richness + InvSimp + BPI + Richness + Shannon +
Sputum Shannon + Shannon + Shannon + Shannon + Shannon +

Potential contaminants

matrix(nrow=3,ncol=5) %>% data.frame() %>% rename(lyPMA = X1, Benzonase = X2, `Host zero` = X3, Molysis = X4, QIAamp = X5) %>%
        rownames_to_column("x") %>% mutate(x = c("BAL", "Nasal swab", "Sputum"),
                                           lyPMA = c("Listeria",
                                                           "Listeria",
                                                           "Listeria, Candida, Corynebacterium"),
                                           Benzonase = c("Listeria",
                                                           "Listeria",
                                                           "Listeria, Candida, Corynebacterium"),
                                           `Host zero` = c("Listeria",
                                                           "Listeria",
                                                           "Listeria, Candida, Corynebacterium"),
                                           Molysis = c("Streptococcaceae, Listeria",
                                                       "Streptococcaceae, Listeria",
                                                           "Streptococcaceae, Listeria, Candida, Corynebacterium"),
                                           QIAamp = c("Listeria",
                                                           "Listeria",
                                                           "Listeria, Candida, Corynebacterium")) %>% column_to_rownames("x") %>%
        kbl(format = "html", caption = "Table of potential contaminants identified by decontam and DA analysis") %>%
        kable_styling(full_width = 0, html_font = "serif") %>%
        column_spec(2:6, italic = T) #%>%
Table of potential contaminants identified by decontam and DA analysis
lyPMA Benzonase Host zero Molysis QIAamp
BAL Listeria Listeria Listeria Streptococcaceae, Listeria Listeria
Nasal swab Listeria Listeria Listeria Streptococcaceae, Listeria Listeria
Sputum Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium Streptococcaceae, Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium
  #row_spec(2:3, bold = T)

Conclusion

1. Effect of treatment was sample type specific.

2. Some methods (lyPMA) made samples failing in library prep.

3. One BAL sample failed in sequencing, but most of treatment enabled its sequencing

4. Alpha diversity and beta diversity were changed by some treatment, specific to some sample type.

5. DA analysis and decontam showed there were some potential contaminants

QIAamp for nasal swab, host zero for BAL and sputum successfully 1) incrased final reads, 2) lowered host %, and 3) did not change diversity matrices.

Molysis was effective in increasing efficiencies of sequencing sptum, however diversity matrices were significantly changed.

As our study contains potential contaminants, further analysis is required after adding data of controls.

Done.

Bibliography

#===============================================================================
#BTC.LineZero.Footer.1.1.0
#===============================================================================
#R markdown citation generator.
#===============================================================================
#RLB.Dependencies:
#   magrittr, pacman, stringr
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#BTC.Dependencies:
#   LineZero.Header
#===============================================================================
#Generates citations for each explicitly loaded library.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
str_libraries <- c("r", str_libraries)
for (str_libraries in str_libraries) {
    str_libraries |>
        pacman::p_citation() |>
        print(bibtex = FALSE) |>
        capture.output() %>%
        .[-1:-3] %>% .[. != ""] |>
        stringr::str_squish() |>
        stringr::str_replace("_", "") |>
        cat()
    cat("\n")
}
## R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Wickham H, Bryan J (2023). readxl: Read Excel Files_. R package version 1.4.2, <https://CRAN.R-project.org/package=readxl>.
## phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## Rinker, T. W. & Kurkiewicz, D. (2017). pacman: Package Management for R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## Garbett SP, Stephens J, Simonov K, Xie Y, Dong Z, Wickham H, Horner J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
## Oksanen J, Simpson G, Blanchet F, Kindt R, Legendre P, Minchin P, O'Hara R, Solymos P, Stevens M, Szoecs E, Wagner H, Barbour M, Bedward M, Bolker B, Borcard D, Carvalho G, Chirico M, De Caceres M, Durand S, Evangelista H, FitzJohn R, Friendly M, Furneaux B, Hannigan G, Hill M, Lahti L, McGlinn D, Ouellette M, Ribeiro Cunha E, Smith T, Stier A, Ter Braak C, Weedon J (2022). vegan: Community Ecology Package. R package version 2.6-4, <https://CRAN.R-project.org/package=vegan>.
## Leo Lahti et al. microbiome R package. URL: http://microbiome.github.io
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.6.2.
## Davis NM, Proctor D, Holmes SP, Relman DA, Callahan BJ (2017). "Simple statistical identification and removal of contaminant sequences in marker-gene and metagenomics data." bioRxiv_, 221499. doi:10.1101/221499 <https://doi.org/10.1101/221499>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Douglas Bates, Martin Maechler, Ben Bolker, Steve Walker (2015). Fitting Linear Mixed-Effects Models Using lme4. Journal of Statistical Software, 67(1), 1-48. doi:10.18637/jss.v067.i01.
## Kuznetsova A, Brockhoff PB, Christensen RHB (2017). "lmerTest Package: Tests in Linear Mixed Effects Models." Journal of Statistical Software, *82*(13), 1-26. doi:10.18637/jss.v082.i13 <https://doi.org/10.18637/jss.v082.i13>.
## Ooms J (2023). writexl: Export Data Frames to Excel 'xlsx' Format_. R package version 1.4.2, <https://CRAN.R-project.org/package=writexl>.
## Gonçalves da Silva A (2017). harrietr: Wrangle Phylogenetic Distance Matrices and Other Utilities. R package version 0.2.3, <https://CRAN.R-project.org/package=harrietr>.
## Mallick H et al. (2020). Multivariable Association in Population-scale Meta-omics Studies, http://huttenhower.sph.harvard.edu/maaslin2. To cite the MaAsLin 2 software, please use: Mallick H, Rahnavard A, McIver LJ (2020). MaAsLin 2: Multivariable Association in Population-scale Meta-omics Studies. R/Bioconductor package, http://huttenhower.sph.harvard.edu/maaslin2.
## Wilke C, Wiernik B (2022). ggtext: Improved Text Rendering Support for 'ggplot2'. R package version 0.1.2, <https://CRAN.R-project.org/package=ggtext>.
## Aphalo P (2022). ggpmisc: Miscellaneous Extensions to 'ggplot2'_. R package version 0.5.2, <https://CRAN.R-project.org/package=ggpmisc>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Wood S, Scheipl F (2020). gamm4: Generalized Additive Mixed Models using 'mgcv' and 'lme4'. R package version 0.2-6, <https://CRAN.R-project.org/package=gamm4>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Hadley Wickham (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21(12), 1-20. URL http://www.jstatsoft.org/v21/i12/.
## Zhu H (2021). kableExtra: Construct Complex Table with 'kable' and Pipe Syntax. R package version 1.3.4, <https://CRAN.R-project.org/package=kableExtra>.
## Yihui Xie (2023). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.42. Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition. Chapman and Hall/CRC. ISBN 978-1498716963 Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible Research in R. In Victoria Stodden, Friedrich Leisch and Roger D. Peng, editors, Implementing Reproducible Computational Research. Chapman and Hall/CRC. ISBN 978-1466561595
## Guangchuang Yu. (2022). Data Integration, Manipulation and Visualization of Phylogenetic Trees (1st edition). Chapman and Hall/CRC. Shuangbin Xu, Lin Li, Xiao Luo, Meijun Chen, Wenli Tang, Li Zhan, Zehan Dai, Tommy T. Lam, Yi Guan, Guangchuang Yu. Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data. iMeta 2022, 4(1):e56. doi:10.1002/imt2.56 Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics, 2020, 69:e96. doi: 10.1002/cpbi.96 Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(2):3041-3043. doi: 10.1093/molbev/msy194 Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## John Fox and Sanford Weisberg (2019). An {R} Companion to Applied Regression, Third Edition. Thousand Oaks CA: Sage. URL: https://socialsciences.mcmaster.ca/jfox/Books/Companion/
#===============================================================================